Merge https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

+3

arch/alpha/include/uapi/asm/socket.h

··· 124 124 125 125 #define SO_DETACH_REUSEPORT_BPF 68 126 126 127 + #define SO_PREFER_BUSY_POLL 69 128 + #define SO_BUSY_POLL_BUDGET 70 129 + 127 130 #if !defined(__KERNEL__) 128 131 129 132 #if __BITS_PER_LONG == 64

+3

arch/mips/include/uapi/asm/socket.h

··· 135 135 136 136 #define SO_DETACH_REUSEPORT_BPF 68 137 137 138 + #define SO_PREFER_BUSY_POLL 69 139 + #define SO_BUSY_POLL_BUDGET 70 140 + 138 141 #if !defined(__KERNEL__) 139 142 140 143 #if __BITS_PER_LONG == 64

+3

arch/parisc/include/uapi/asm/socket.h

··· 116 116 117 117 #define SO_DETACH_REUSEPORT_BPF 0x4042 118 118 119 + #define SO_PREFER_BUSY_POLL 0x4043 120 + #define SO_BUSY_POLL_BUDGET 0x4044 121 + 119 122 #if !defined(__KERNEL__) 120 123 121 124 #if __BITS_PER_LONG == 64

+3

arch/sparc/include/uapi/asm/socket.h

··· 117 117 118 118 #define SO_DETACH_REUSEPORT_BPF 0x0047 119 119 120 + #define SO_PREFER_BUSY_POLL 0x0048 121 + #define SO_BUSY_POLL_BUDGET 0x0049 122 + 120 123 #if !defined(__KERNEL__) 121 124 122 125

+1 -1

drivers/net/ethernet/amazon/ena/ena_netdev.c

··· 416 416 { 417 417 int rc; 418 418 419 - rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid); 419 + rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid, 0); 420 420 421 421 if (rc) { 422 422 netif_err(rx_ring->adapter, ifup, rx_ring->netdev,

+1 -1

drivers/net/ethernet/broadcom/bnxt/bnxt.c

··· 2884 2884 if (rc) 2885 2885 return rc; 2886 2886 2887 - rc = xdp_rxq_info_reg(&rxr->xdp_rxq, bp->dev, i); 2887 + rc = xdp_rxq_info_reg(&rxr->xdp_rxq, bp->dev, i, 0); 2888 2888 if (rc < 0) 2889 2889 return rc; 2890 2890

+1 -1

drivers/net/ethernet/cavium/thunder/nicvf_queues.c

··· 770 770 rq->caching = 1; 771 771 772 772 /* Driver have no proper error path for failed XDP RX-queue info reg */ 773 - WARN_ON(xdp_rxq_info_reg(&rq->xdp_rxq, nic->netdev, qidx) < 0); 773 + WARN_ON(xdp_rxq_info_reg(&rq->xdp_rxq, nic->netdev, qidx, 0) < 0); 774 774 775 775 /* Send a mailbox msg to PF to config RQ */ 776 776 mbx.rq.msg = NIC_MBOX_MSG_RQ_CFG;

+1 -1

drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c

··· 3334 3334 return 0; 3335 3335 3336 3336 err = xdp_rxq_info_reg(&fq->channel->xdp_rxq, priv->net_dev, 3337 - fq->flowid); 3337 + fq->flowid, 0); 3338 3338 if (err) { 3339 3339 dev_err(dev, "xdp_rxq_info_reg failed\n"); 3340 3340 return err;

+12 -1

drivers/net/ethernet/intel/i40e/i40e_txrx.c

··· 676 676 i40e_clean_tx_ring(tx_ring); 677 677 kfree(tx_ring->tx_bi); 678 678 tx_ring->tx_bi = NULL; 679 + kfree(tx_ring->xsk_descs); 680 + tx_ring->xsk_descs = NULL; 679 681 680 682 if (tx_ring->desc) { 681 683 dma_free_coherent(tx_ring->dev, tx_ring->size, ··· 1279 1277 if (!tx_ring->tx_bi) 1280 1278 goto err; 1281 1279 1280 + if (ring_is_xdp(tx_ring)) { 1281 + tx_ring->xsk_descs = kcalloc(I40E_MAX_NUM_DESCRIPTORS, sizeof(*tx_ring->xsk_descs), 1282 + GFP_KERNEL); 1283 + if (!tx_ring->xsk_descs) 1284 + goto err; 1285 + } 1286 + 1282 1287 u64_stats_init(&tx_ring->syncp); 1283 1288 1284 1289 /* round up to nearest 4K */ ··· 1309 1300 return 0; 1310 1301 1311 1302 err: 1303 + kfree(tx_ring->xsk_descs); 1304 + tx_ring->xsk_descs = NULL; 1312 1305 kfree(tx_ring->tx_bi); 1313 1306 tx_ring->tx_bi = NULL; 1314 1307 return -ENOMEM; ··· 1447 1436 /* XDP RX-queue info only needed for RX rings exposed to XDP */ 1448 1437 if (rx_ring->vsi->type == I40E_VSI_MAIN) { 1449 1438 err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, 1450 - rx_ring->queue_index); 1439 + rx_ring->queue_index, rx_ring->q_vector->napi.napi_id); 1451 1440 if (err < 0) 1452 1441 return err; 1453 1442 }

+1

drivers/net/ethernet/intel/i40e/i40e_txrx.h

··· 389 389 struct i40e_channel *ch; 390 390 struct xdp_rxq_info xdp_rxq; 391 391 struct xsk_buff_pool *xsk_pool; 392 + struct xdp_desc *xsk_descs; /* For storing descriptors in the AF_XDP ZC path */ 392 393 } ____cacheline_internodealigned_in_smp; 393 394 394 395 static inline bool ring_uses_build_skb(struct i40e_ring *ring)

+82 -37

drivers/net/ethernet/intel/i40e/i40e_xsk.c

··· 2 2 /* Copyright(c) 2018 Intel Corporation. */ 3 3 4 4 #include <linux/bpf_trace.h> 5 + #include <linux/stringify.h> 5 6 #include <net/xdp_sock_drv.h> 6 7 #include <net/xdp.h> 7 8 ··· 381 380 return failure ? budget : (int)total_rx_packets; 382 381 } 383 382 383 + static void i40e_xmit_pkt(struct i40e_ring *xdp_ring, struct xdp_desc *desc, 384 + unsigned int *total_bytes) 385 + { 386 + struct i40e_tx_desc *tx_desc; 387 + dma_addr_t dma; 388 + 389 + dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc->addr); 390 + xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc->len); 391 + 392 + tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use++); 393 + tx_desc->buffer_addr = cpu_to_le64(dma); 394 + tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC | I40E_TX_DESC_CMD_EOP, 395 + 0, desc->len, 0); 396 + 397 + *total_bytes += desc->len; 398 + } 399 + 400 + static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *desc, 401 + unsigned int *total_bytes) 402 + { 403 + u16 ntu = xdp_ring->next_to_use; 404 + struct i40e_tx_desc *tx_desc; 405 + dma_addr_t dma; 406 + u32 i; 407 + 408 + loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) { 409 + dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr); 410 + xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc[i].len); 411 + 412 + tx_desc = I40E_TX_DESC(xdp_ring, ntu++); 413 + tx_desc->buffer_addr = cpu_to_le64(dma); 414 + tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC | 415 + I40E_TX_DESC_CMD_EOP, 416 + 0, desc[i].len, 0); 417 + 418 + *total_bytes += desc[i].len; 419 + } 420 + 421 + xdp_ring->next_to_use = ntu; 422 + } 423 + 424 + static void i40e_fill_tx_hw_ring(struct i40e_ring *xdp_ring, struct xdp_desc *descs, u32 nb_pkts, 425 + unsigned int *total_bytes) 426 + { 427 + u32 batched, leftover, i; 428 + 429 + batched = nb_pkts & ~(PKTS_PER_BATCH - 1); 430 + leftover = nb_pkts & (PKTS_PER_BATCH - 1); 431 + for (i = 0; i < batched; i += PKTS_PER_BATCH) 432 
+ i40e_xmit_pkt_batch(xdp_ring, &descs[i], total_bytes); 433 + for (i = batched; i < batched + leftover; i++) 434 + i40e_xmit_pkt(xdp_ring, &descs[i], total_bytes); 435 + } 436 + 437 + static void i40e_set_rs_bit(struct i40e_ring *xdp_ring) 438 + { 439 + u16 ntu = xdp_ring->next_to_use ? xdp_ring->next_to_use - 1 : xdp_ring->count - 1; 440 + struct i40e_tx_desc *tx_desc; 441 + 442 + tx_desc = I40E_TX_DESC(xdp_ring, ntu); 443 + tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS << I40E_TXD_QW1_CMD_SHIFT); 444 + } 445 + 384 446 /** 385 447 * i40e_xmit_zc - Performs zero-copy Tx AF_XDP 386 448 * @xdp_ring: XDP Tx ring ··· 453 389 **/ 454 390 static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget) 455 391 { 456 - unsigned int sent_frames = 0, total_bytes = 0; 457 - struct i40e_tx_desc *tx_desc = NULL; 458 - struct i40e_tx_buffer *tx_bi; 459 - struct xdp_desc desc; 460 - dma_addr_t dma; 392 + struct xdp_desc *descs = xdp_ring->xsk_descs; 393 + u32 nb_pkts, nb_processed = 0; 394 + unsigned int total_bytes = 0; 461 395 462 - while (budget-- > 0) { 463 - if (!xsk_tx_peek_desc(xdp_ring->xsk_pool, &desc)) 464 - break; 396 + nb_pkts = xsk_tx_peek_release_desc_batch(xdp_ring->xsk_pool, descs, budget); 397 + if (!nb_pkts) 398 + return false; 465 399 466 - dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc.addr); 467 - xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, 468 - desc.len); 469 - 470 - tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use]; 471 - tx_bi->bytecount = desc.len; 472 - 473 - tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use); 474 - tx_desc->buffer_addr = cpu_to_le64(dma); 475 - tx_desc->cmd_type_offset_bsz = 476 - build_ctob(I40E_TX_DESC_CMD_ICRC 477 - | I40E_TX_DESC_CMD_EOP, 478 - 0, desc.len, 0); 479 - 480 - sent_frames++; 481 - total_bytes += tx_bi->bytecount; 482 - 483 - xdp_ring->next_to_use++; 484 - if (xdp_ring->next_to_use == xdp_ring->count) 485 - xdp_ring->next_to_use = 0; 400 + if (xdp_ring->next_to_use + nb_pkts >= 
xdp_ring->count) { 401 + nb_processed = xdp_ring->count - xdp_ring->next_to_use; 402 + i40e_fill_tx_hw_ring(xdp_ring, descs, nb_processed, &total_bytes); 403 + xdp_ring->next_to_use = 0; 486 404 } 487 405 488 - if (tx_desc) { 489 - /* Request an interrupt for the last frame and bump tail ptr. */ 490 - tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS << 491 - I40E_TXD_QW1_CMD_SHIFT); 492 - i40e_xdp_ring_update_tail(xdp_ring); 406 + i40e_fill_tx_hw_ring(xdp_ring, &descs[nb_processed], nb_pkts - nb_processed, 407 + &total_bytes); 493 408 494 - xsk_tx_release(xdp_ring->xsk_pool); 495 - i40e_update_tx_stats(xdp_ring, sent_frames, total_bytes); 496 - } 409 + /* Request an interrupt for the last frame and bump tail ptr. */ 410 + i40e_set_rs_bit(xdp_ring); 411 + i40e_xdp_ring_update_tail(xdp_ring); 497 412 498 - return !!budget; 413 + i40e_update_tx_stats(xdp_ring, nb_pkts, total_bytes); 414 + 415 + return true; 499 416 } 500 417 501 418 /**

+16

drivers/net/ethernet/intel/i40e/i40e_xsk.h

··· 4 4 #ifndef _I40E_XSK_H_ 5 5 #define _I40E_XSK_H_ 6 6 7 + /* This value should match the pragma in the loop_unrolled_for 8 + * macro. Why 4? It is strictly empirical. It seems to be a good 9 + * compromise between the advantage of having simultaneous outstanding 10 + * reads to the DMA array that can hide each others latency and the 11 + * disadvantage of having a larger code path. 12 + */ 13 + #define PKTS_PER_BATCH 4 14 + 15 + #ifdef __clang__ 16 + #define loop_unrolled_for _Pragma("clang loop unroll_count(4)") for 17 + #elif __GNUC__ >= 8 18 + #define loop_unrolled_for _Pragma("GCC unroll 4") for 19 + #else 20 + #define loop_unrolled_for for 21 + #endif 22 + 7 23 struct i40e_vsi; 8 24 struct xsk_buff_pool; 9 25 struct zero_copy_allocator;

+2 -2

drivers/net/ethernet/intel/ice/ice_base.c

··· 306 306 if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) 307 307 /* coverity[check_return] */ 308 308 xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, 309 - ring->q_index); 309 + ring->q_index, ring->q_vector->napi.napi_id); 310 310 311 311 ring->xsk_pool = ice_xsk_pool(ring); 312 312 if (ring->xsk_pool) { ··· 333 333 /* coverity[check_return] */ 334 334 xdp_rxq_info_reg(&ring->xdp_rxq, 335 335 ring->netdev, 336 - ring->q_index); 336 + ring->q_index, ring->q_vector->napi.napi_id); 337 337 338 338 err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, 339 339 MEM_TYPE_PAGE_SHARED,

+1 -1

drivers/net/ethernet/intel/ice/ice_txrx.c

··· 483 483 if (rx_ring->vsi->type == ICE_VSI_PF && 484 484 !xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) 485 485 if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, 486 - rx_ring->q_index)) 486 + rx_ring->q_index, rx_ring->q_vector->napi.napi_id)) 487 487 goto err; 488 488 return 0; 489 489

+1 -1

drivers/net/ethernet/intel/igb/igb_main.c

··· 4352 4352 4353 4353 /* XDP RX-queue info */ 4354 4354 if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, 4355 - rx_ring->queue_index) < 0) 4355 + rx_ring->queue_index, 0) < 0) 4356 4356 goto err; 4357 4357 4358 4358 return 0;

+1 -1

drivers/net/ethernet/intel/ixgbe/ixgbe_main.c

··· 6577 6577 6578 6578 /* XDP RX-queue info */ 6579 6579 if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, adapter->netdev, 6580 - rx_ring->queue_index) < 0) 6580 + rx_ring->queue_index, rx_ring->q_vector->napi.napi_id) < 0) 6581 6581 goto err; 6582 6582 6583 6583 rx_ring->xdp_prog = adapter->xdp_prog;

+1 -1

drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c

··· 3493 3493 3494 3494 /* XDP RX-queue info */ 3495 3495 if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, adapter->netdev, 3496 - rx_ring->queue_index) < 0) 3496 + rx_ring->queue_index, 0) < 0) 3497 3497 goto err; 3498 3498 3499 3499 rx_ring->xdp_prog = adapter->xdp_prog;

+1 -1

drivers/net/ethernet/marvell/mvneta.c

··· 3243 3243 return err; 3244 3244 } 3245 3245 3246 - err = xdp_rxq_info_reg(&rxq->xdp_rxq, pp->dev, rxq->id); 3246 + err = xdp_rxq_info_reg(&rxq->xdp_rxq, pp->dev, rxq->id, 0); 3247 3247 if (err < 0) 3248 3248 goto err_free_pp; 3249 3249

+2 -2

drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c

··· 2614 2614 mvpp2_rxq_status_update(port, rxq->id, 0, rxq->size); 2615 2615 2616 2616 if (priv->percpu_pools) { 2617 - err = xdp_rxq_info_reg(&rxq->xdp_rxq_short, port->dev, rxq->id); 2617 + err = xdp_rxq_info_reg(&rxq->xdp_rxq_short, port->dev, rxq->id, 0); 2618 2618 if (err < 0) 2619 2619 goto err_free_dma; 2620 2620 2621 - err = xdp_rxq_info_reg(&rxq->xdp_rxq_long, port->dev, rxq->id); 2621 + err = xdp_rxq_info_reg(&rxq->xdp_rxq_long, port->dev, rxq->id, 0); 2622 2622 if (err < 0) 2623 2623 goto err_unregister_rxq_short; 2624 2624

+1 -1

drivers/net/ethernet/mellanox/mlx4/en_rx.c

··· 283 283 ring->log_stride = ffs(ring->stride) - 1; 284 284 ring->buf_size = ring->size * ring->stride + TXBB_SIZE; 285 285 286 - if (xdp_rxq_info_reg(&ring->xdp_rxq, priv->dev, queue_index) < 0) 286 + if (xdp_rxq_info_reg(&ring->xdp_rxq, priv->dev, queue_index, 0) < 0) 287 287 goto err_ring; 288 288 289 289 tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS *

+1 -1

drivers/net/ethernet/mellanox/mlx5/core/en_main.c

··· 434 434 rq_xdp_ix = rq->ix; 435 435 if (xsk) 436 436 rq_xdp_ix += params->num_channels * MLX5E_RQ_GROUP_XSK; 437 - err = xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq_xdp_ix); 437 + err = xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq_xdp_ix, 0); 438 438 if (err < 0) 439 439 goto err_rq_xdp_prog; 440 440

+1 -1

drivers/net/ethernet/netronome/nfp/nfp_net_common.c

··· 2533 2533 2534 2534 if (dp->netdev) { 2535 2535 err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, dp->netdev, 2536 - rx_ring->idx); 2536 + rx_ring->idx, rx_ring->r_vec->napi.napi_id); 2537 2537 if (err < 0) 2538 2538 return err; 2539 2539 }

+1 -1

drivers/net/ethernet/qlogic/qede/qede_main.c

··· 1762 1762 1763 1763 /* Driver have no error path from here */ 1764 1764 WARN_ON(xdp_rxq_info_reg(&fp->rxq->xdp_rxq, edev->ndev, 1765 - fp->rxq->rxq_id) < 0); 1765 + fp->rxq->rxq_id, 0) < 0); 1766 1766 1767 1767 if (xdp_rxq_info_reg_mem_model(&fp->rxq->xdp_rxq, 1768 1768 MEM_TYPE_PAGE_ORDER0,

+1 -1

drivers/net/ethernet/sfc/rx_common.c

··· 262 262 263 263 /* Initialise XDP queue information */ 264 264 rc = xdp_rxq_info_reg(&rx_queue->xdp_rxq_info, efx->net_dev, 265 - rx_queue->core_index); 265 + rx_queue->core_index, 0); 266 266 267 267 if (rc) { 268 268 netif_err(efx, rx_err, efx->net_dev,

+1 -1

drivers/net/ethernet/socionext/netsec.c

··· 1314 1314 goto err_out; 1315 1315 } 1316 1316 1317 - err = xdp_rxq_info_reg(&dring->xdp_rxq, priv->ndev, 0); 1317 + err = xdp_rxq_info_reg(&dring->xdp_rxq, priv->ndev, 0, priv->napi.napi_id); 1318 1318 if (err) 1319 1319 goto err_out; 1320 1320

+1 -1

drivers/net/ethernet/ti/cpsw_priv.c

··· 1186 1186 pool = cpsw->page_pool[ch]; 1187 1187 rxq = &priv->xdp_rxq[ch]; 1188 1188 1189 - ret = xdp_rxq_info_reg(rxq, priv->ndev, ch); 1189 + ret = xdp_rxq_info_reg(rxq, priv->ndev, ch, 0); 1190 1190 if (ret) 1191 1191 return ret; 1192 1192

+1 -1

drivers/net/hyperv/netvsc.c

··· 1499 1499 u64_stats_init(&nvchan->tx_stats.syncp); 1500 1500 u64_stats_init(&nvchan->rx_stats.syncp); 1501 1501 1502 - ret = xdp_rxq_info_reg(&nvchan->xdp_rxq, ndev, i); 1502 + ret = xdp_rxq_info_reg(&nvchan->xdp_rxq, ndev, i, 0); 1503 1503 1504 1504 if (ret) { 1505 1505 netdev_err(ndev, "xdp_rxq_info_reg fail: %d\n", ret);

+1 -1

drivers/net/tun.c

··· 780 780 } else { 781 781 /* Setup XDP RX-queue info, for new tfile getting attached */ 782 782 err = xdp_rxq_info_reg(&tfile->xdp_rxq, 783 - tun->dev, tfile->queue_index); 783 + tun->dev, tfile->queue_index, 0); 784 784 if (err < 0) 785 785 goto out; 786 786 err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq,

+8 -4

drivers/net/veth.c

··· 884 884 for (i = 0; i < dev->real_num_rx_queues; i++) { 885 885 struct veth_rq *rq = &priv->rq[i]; 886 886 887 - netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); 888 887 napi_enable(&rq->xdp_napi); 889 888 } 890 889 ··· 925 926 for (i = 0; i < dev->real_num_rx_queues; i++) { 926 927 struct veth_rq *rq = &priv->rq[i]; 927 928 928 - err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i); 929 + netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); 930 + err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id); 929 931 if (err < 0) 930 932 goto err_rxq_reg; 931 933 ··· 952 952 err_reg_mem: 953 953 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 954 954 err_rxq_reg: 955 - for (i--; i >= 0; i--) 956 - xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 955 + for (i--; i >= 0; i--) { 956 + struct veth_rq *rq = &priv->rq[i]; 957 + 958 + xdp_rxq_info_unreg(&rq->xdp_rxq); 959 + netif_napi_del(&rq->xdp_napi); 960 + } 957 961 958 962 return err; 959 963 }

+1 -1

drivers/net/virtio_net.c

··· 1485 1485 if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL)) 1486 1486 schedule_delayed_work(&vi->refill, 0); 1487 1487 1488 - err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i); 1488 + err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i, vi->rq[i].napi.napi_id); 1489 1489 if (err < 0) 1490 1490 return err; 1491 1491

+1 -1

drivers/net/xen-netfront.c

··· 2014 2014 } 2015 2015 2016 2016 err = xdp_rxq_info_reg(&queue->xdp_rxq, queue->info->netdev, 2017 - queue->id); 2017 + queue->id, 0); 2018 2018 if (err) { 2019 2019 netdev_err(queue->info->netdev, "xdp_rxq_info_reg failed\n"); 2020 2020 goto err_free_pp;

+1 -1

fs/buffer.c

··· 657 657 } while (bh != head); 658 658 } 659 659 /* 660 - * Lock out page->mem_cgroup migration to keep PageDirty 660 + * Lock out page's memcg migration to keep PageDirty 661 661 * synchronized with per-memcg dirty page counters. 662 662 */ 663 663 lock_page_memcg(page);

+2 -1

fs/eventpoll.c

··· 397 397 unsigned int napi_id = READ_ONCE(ep->napi_id); 398 398 399 399 if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) 400 - napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep); 400 + napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false, 401 + BUSY_POLL_BUDGET); 401 402 } 402 403 403 404 static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)

+1 -1

fs/iomap/buffered-io.c

··· 650 650 return !TestSetPageDirty(page); 651 651 652 652 /* 653 - * Lock out page->mem_cgroup migration to keep PageDirty 653 + * Lock out page's memcg migration to keep PageDirty 654 654 * synchronized with per-memcg dirty page counters. 655 655 */ 656 656 lock_page_memcg(page);

+6 -6

include/linux/bpf-cgroup.h

··· 246 246 __ret; \ 247 247 }) 248 248 249 - #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \ 250 - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND) 249 + #define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) \ 250 + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_BIND, NULL) 251 251 252 - #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \ 253 - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND) 252 + #define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) \ 253 + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_BIND, NULL) 254 254 255 255 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \ 256 256 sk->sk_prot->pre_connect) ··· 434 434 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) 435 435 #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) 436 436 #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; }) 437 - #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) 438 - #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) 437 + #define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) ({ 0; }) 438 + #define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) ({ 0; }) 439 439 #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) 440 440 #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) 441 441 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; })

+44 -27

include/linux/bpf.h

··· 20 20 #include <linux/module.h> 21 21 #include <linux/kallsyms.h> 22 22 #include <linux/capability.h> 23 + #include <linux/sched/mm.h> 24 + #include <linux/slab.h> 23 25 24 26 struct bpf_verifier_env; 25 27 struct bpf_verifier_log; ··· 39 37 struct bpf_local_storage; 40 38 struct bpf_local_storage_map; 41 39 struct kobject; 40 + struct mem_cgroup; 42 41 43 42 extern struct idr btf_idr; 44 43 extern spinlock_t btf_idr_lock; ··· 138 135 const struct bpf_iter_seq_info *iter_seq_info; 139 136 }; 140 137 141 - struct bpf_map_memory { 142 - u32 pages; 143 - struct user_struct *user; 144 - }; 145 - 146 138 struct bpf_map { 147 139 /* The first two cachelines with read-mostly members of which some 148 140 * are also accessed in fast-path (e.g. ops, max_entries). ··· 158 160 u32 btf_key_type_id; 159 161 u32 btf_value_type_id; 160 162 struct btf *btf; 161 - struct bpf_map_memory memory; 163 + #ifdef CONFIG_MEMCG_KMEM 164 + struct mem_cgroup *memcg; 165 + #endif 162 166 char name[BPF_OBJ_NAME_LEN]; 163 167 u32 btf_vmlinux_value_type_id; 164 168 bool bypass_spec_v1; ··· 421 421 enum bpf_reg_type reg_type; 422 422 union { 423 423 int ctx_field_size; 424 - u32 btf_id; 424 + struct { 425 + struct btf *btf; 426 + u32 btf_id; 427 + }; 425 428 }; 426 429 struct bpf_verifier_log *log; /* for verbose logs */ 427 430 }; ··· 461 458 struct bpf_insn *dst, 462 459 struct bpf_prog *prog, u32 *target_size); 463 460 int (*btf_struct_access)(struct bpf_verifier_log *log, 461 + const struct btf *btf, 464 462 const struct btf_type *t, int off, int size, 465 463 enum bpf_access_type atype, 466 464 u32 *next_btf_id); ··· 775 771 u32 ctx_arg_info_size; 776 772 u32 max_rdonly_access; 777 773 u32 max_rdwr_access; 774 + struct btf *attach_btf; 778 775 const struct bpf_ctx_arg_aux *ctx_arg_info; 779 776 struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */ 780 777 struct bpf_prog *dst_prog; ··· 1010 1005 1011 1006 bool bpf_prog_array_compatible(struct bpf_array 
*array, const struct bpf_prog *fp); 1012 1007 int bpf_prog_calc_tag(struct bpf_prog *fp); 1013 - const char *kernel_type_name(u32 btf_type_id); 1014 1008 1015 1009 const struct bpf_func_proto *bpf_get_trace_printk_proto(void); 1016 1010 ··· 1206 1202 void bpf_prog_inc(struct bpf_prog *prog); 1207 1203 struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog); 1208 1204 void bpf_prog_put(struct bpf_prog *prog); 1209 - int __bpf_prog_charge(struct user_struct *user, u32 pages); 1210 - void __bpf_prog_uncharge(struct user_struct *user, u32 pages); 1211 1205 void __bpf_free_used_maps(struct bpf_prog_aux *aux, 1212 1206 struct bpf_map **used_maps, u32 len); 1213 1207 ··· 1220 1218 struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map); 1221 1219 void bpf_map_put_with_uref(struct bpf_map *map); 1222 1220 void bpf_map_put(struct bpf_map *map); 1223 - int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); 1224 - void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages); 1225 - int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size); 1226 - void bpf_map_charge_finish(struct bpf_map_memory *mem); 1227 - void bpf_map_charge_move(struct bpf_map_memory *dst, 1228 - struct bpf_map_memory *src); 1229 1221 void *bpf_map_area_alloc(u64 size, int numa_node); 1230 1222 void *bpf_map_area_mmapable_alloc(u64 size, int numa_node); 1231 1223 void bpf_map_area_free(void *base); ··· 1235 1239 union bpf_attr __user *uattr); 1236 1240 struct bpf_map *bpf_map_get_curr_or_next(u32 *id); 1237 1241 struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); 1242 + 1243 + #ifdef CONFIG_MEMCG_KMEM 1244 + void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, 1245 + int node); 1246 + void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags); 1247 + void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, 1248 + size_t align, gfp_t flags); 1249 + #else 1250 + static inline void * 1251 + 
bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, 1252 + int node) 1253 + { 1254 + return kmalloc_node(size, flags, node); 1255 + } 1256 + 1257 + static inline void * 1258 + bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) 1259 + { 1260 + return kzalloc(size, flags); 1261 + } 1262 + 1263 + static inline void __percpu * 1264 + bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, 1265 + gfp_t flags) 1266 + { 1267 + return __alloc_percpu_gfp(size, align, flags); 1268 + } 1269 + #endif 1238 1270 1239 1271 extern int sysctl_unprivileged_bpf_disabled; 1240 1272 ··· 1454 1430 bool btf_ctx_access(int off, int size, enum bpf_access_type type, 1455 1431 const struct bpf_prog *prog, 1456 1432 struct bpf_insn_access_aux *info); 1457 - int btf_struct_access(struct bpf_verifier_log *log, 1433 + int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, 1458 1434 const struct btf_type *t, int off, int size, 1459 1435 enum bpf_access_type atype, 1460 1436 u32 *next_btf_id); 1461 1437 bool btf_struct_ids_match(struct bpf_verifier_log *log, 1462 - int off, u32 id, u32 need_type_id); 1438 + const struct btf *btf, u32 id, int off, 1439 + const struct btf *need_btf, u32 need_type_id); 1463 1440 1464 1441 int btf_distill_func_proto(struct bpf_verifier_log *log, 1465 1442 struct btf *btf, ··· 1513 1488 bpf_prog_inc_not_zero(struct bpf_prog *prog) 1514 1489 { 1515 1490 return ERR_PTR(-EOPNOTSUPP); 1516 - } 1517 - 1518 - static inline int __bpf_prog_charge(struct user_struct *user, u32 pages) 1519 - { 1520 - return 0; 1521 - } 1522 - 1523 - static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages) 1524 - { 1525 1491 } 1526 1492 1527 1493 static inline void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, ··· 1858 1842 extern const struct bpf_func_proto bpf_snprintf_btf_proto; 1859 1843 extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; 1860 1844 extern const struct 
bpf_func_proto bpf_this_cpu_ptr_proto; 1845 + extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; 1861 1846 1862 1847 const struct bpf_func_proto *bpf_tracing_func_proto( 1863 1848 enum bpf_func_id func_id, const struct bpf_prog *prog);

+21 -7

include/linux/bpf_verifier.h

··· 5 5 #define _LINUX_BPF_VERIFIER_H 1 6 6 7 7 #include <linux/bpf.h> /* for enum bpf_reg_type */ 8 + #include <linux/btf.h> /* for struct btf and btf_id() */ 8 9 #include <linux/filter.h> /* for MAX_BPF_STACK */ 9 10 #include <linux/tnum.h> 10 11 ··· 44 43 struct bpf_reg_state { 45 44 /* Ordering of fields matters. See states_equal() */ 46 45 enum bpf_reg_type type; 46 + /* Fixed part of pointer offset, pointer types only */ 47 + s32 off; 47 48 union { 48 49 /* valid when type == PTR_TO_PACKET */ 49 50 int range; ··· 55 52 */ 56 53 struct bpf_map *map_ptr; 57 54 58 - u32 btf_id; /* for PTR_TO_BTF_ID */ 55 + /* for PTR_TO_BTF_ID */ 56 + struct { 57 + struct btf *btf; 58 + u32 btf_id; 59 + }; 59 60 60 61 u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */ 61 62 62 63 /* Max size from any of the above. */ 63 - unsigned long raw; 64 + struct { 65 + unsigned long raw1; 66 + unsigned long raw2; 67 + } raw; 64 68 }; 65 - /* Fixed part of pointer offset, pointer types only */ 66 - s32 off; 67 69 /* For PTR_TO_PACKET, used to find other pointers with the same variable 68 70 * offset, so they can share range knowledge. 69 71 * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we ··· 319 311 struct { 320 312 enum bpf_reg_type reg_type; /* type of pseudo_btf_id */ 321 313 union { 322 - u32 btf_id; /* btf_id for struct typed var */ 314 + struct { 315 + struct btf *btf; 316 + u32 btf_id; /* btf_id for struct typed var */ 317 + }; 323 318 u32 mem_size; /* mem_size for non-struct typed var */ 324 319 }; 325 320 } btf_var; ··· 470 459 471 460 /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ 472 461 static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, 473 - u32 btf_id) 462 + struct btf *btf, u32 btf_id) 474 463 { 475 - return tgt_prog ? 
(((u64)tgt_prog->aux->id) << 32 | btf_id) : btf_id; 464 + if (tgt_prog) 465 + return ((u64)tgt_prog->aux->id << 32) | btf_id; 466 + else 467 + return ((u64)btf_obj_id(btf) << 32) | 0x80000000 | btf_id; 476 468 } 477 469 478 470 int bpf_check_attach_target(struct bpf_verifier_log *log,

+5 -1

include/linux/btf.h

··· 18 18 19 19 extern const struct file_operations btf_fops; 20 20 21 + void btf_get(struct btf *btf); 21 22 void btf_put(struct btf *btf); 22 23 int btf_new_fd(const union bpf_attr *attr); 23 24 struct btf *btf_get_by_fd(int fd); ··· 89 88 char *buf, int len, u64 flags); 90 89 91 90 int btf_get_fd_by_id(u32 id); 92 - u32 btf_id(const struct btf *btf); 91 + u32 btf_obj_id(const struct btf *btf); 92 + bool btf_is_kernel(const struct btf *btf); 93 93 bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, 94 94 const struct btf_member *m, 95 95 u32 expected_offset, u32 expected_size); ··· 208 206 } 209 207 210 208 #ifdef CONFIG_BPF_SYSCALL 209 + struct bpf_prog; 210 + 211 211 const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); 212 212 const char *btf_name_by_offset(const struct btf *btf, u32 offset); 213 213 struct btf *btf_parse_vmlinux(void);

+6

include/linux/ima.h

··· 29 29 enum kernel_read_file_id id); 30 30 extern void ima_post_path_mknod(struct dentry *dentry); 31 31 extern int ima_file_hash(struct file *file, char *buf, size_t buf_size); 32 + extern int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size); 32 33 extern void ima_kexec_cmdline(int kernel_fd, const void *buf, int size); 33 34 34 35 #ifdef CONFIG_IMA_KEXEC ··· 112 111 } 113 112 114 113 static inline int ima_file_hash(struct file *file, char *buf, size_t buf_size) 114 + { 115 + return -EOPNOTSUPP; 116 + } 117 + 118 + static inline int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size) 115 119 { 116 120 return -EOPNOTSUPP; 117 121 }

+206 -9

include/linux/memcontrol.h

··· 343 343 344 344 extern struct mem_cgroup *root_mem_cgroup; 345 345 346 + enum page_memcg_data_flags { 347 + /* page->memcg_data is a pointer to an objcgs vector */ 348 + MEMCG_DATA_OBJCGS = (1UL << 0), 349 + /* page has been accounted as a non-slab kernel page */ 350 + MEMCG_DATA_KMEM = (1UL << 1), 351 + /* the next bit after the last actual flag */ 352 + __NR_MEMCG_DATA_FLAGS = (1UL << 2), 353 + }; 354 + 355 + #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1) 356 + 357 + /* 358 + * page_memcg - get the memory cgroup associated with a page 359 + * @page: a pointer to the page struct 360 + * 361 + * Returns a pointer to the memory cgroup associated with the page, 362 + * or NULL. This function assumes that the page is known to have a 363 + * proper memory cgroup pointer. It's not safe to call this function 364 + * against some type of pages, e.g. slab pages or ex-slab pages. 365 + * 366 + * Any of the following ensures page and memcg binding stability: 367 + * - the page lock 368 + * - LRU isolation 369 + * - lock_page_memcg() 370 + * - exclusive reference 371 + */ 372 + static inline struct mem_cgroup *page_memcg(struct page *page) 373 + { 374 + unsigned long memcg_data = page->memcg_data; 375 + 376 + VM_BUG_ON_PAGE(PageSlab(page), page); 377 + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page); 378 + 379 + return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); 380 + } 381 + 382 + /* 383 + * page_memcg_rcu - locklessly get the memory cgroup associated with a page 384 + * @page: a pointer to the page struct 385 + * 386 + * Returns a pointer to the memory cgroup associated with the page, 387 + * or NULL. This function assumes that the page is known to have a 388 + * proper memory cgroup pointer. It's not safe to call this function 389 + * against some type of pages, e.g. slab pages or ex-slab pages. 
390 + */ 391 + static inline struct mem_cgroup *page_memcg_rcu(struct page *page) 392 + { 393 + VM_BUG_ON_PAGE(PageSlab(page), page); 394 + WARN_ON_ONCE(!rcu_read_lock_held()); 395 + 396 + return (struct mem_cgroup *)(READ_ONCE(page->memcg_data) & 397 + ~MEMCG_DATA_FLAGS_MASK); 398 + } 399 + 400 + /* 401 + * page_memcg_check - get the memory cgroup associated with a page 402 + * @page: a pointer to the page struct 403 + * 404 + * Returns a pointer to the memory cgroup associated with the page, 405 + * or NULL. This function unlike page_memcg() can take any page 406 + * as an argument. It has to be used in cases when it's not known if a page 407 + * has an associated memory cgroup pointer or an object cgroups vector. 408 + * 409 + * Any of the following ensures page and memcg binding stability: 410 + * - the page lock 411 + * - LRU isolation 412 + * - lock_page_memcg() 413 + * - exclusive reference 414 + */ 415 + static inline struct mem_cgroup *page_memcg_check(struct page *page) 416 + { 417 + /* 418 + * Because page->memcg_data might be changed asynchronously 419 + * for slab pages, READ_ONCE() should be used here. 420 + */ 421 + unsigned long memcg_data = READ_ONCE(page->memcg_data); 422 + 423 + if (memcg_data & MEMCG_DATA_OBJCGS) 424 + return NULL; 425 + 426 + return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); 427 + } 428 + 429 + /* 430 + * PageMemcgKmem - check if the page has MemcgKmem flag set 431 + * @page: a pointer to the page struct 432 + * 433 + * Checks if the page has MemcgKmem flag set. The caller must ensure that 434 + * the page has an associated memory cgroup. It's not safe to call this function 435 + * against some types of pages, e.g. slab pages. 
436 + */ 437 + static inline bool PageMemcgKmem(struct page *page) 438 + { 439 + VM_BUG_ON_PAGE(page->memcg_data & MEMCG_DATA_OBJCGS, page); 440 + return page->memcg_data & MEMCG_DATA_KMEM; 441 + } 442 + 443 + #ifdef CONFIG_MEMCG_KMEM 444 + /* 445 + * page_objcgs - get the object cgroups vector associated with a page 446 + * @page: a pointer to the page struct 447 + * 448 + * Returns a pointer to the object cgroups vector associated with the page, 449 + * or NULL. This function assumes that the page is known to have an 450 + * associated object cgroups vector. It's not safe to call this function 451 + * against pages, which might have an associated memory cgroup: e.g. 452 + * kernel stack pages. 453 + */ 454 + static inline struct obj_cgroup **page_objcgs(struct page *page) 455 + { 456 + unsigned long memcg_data = READ_ONCE(page->memcg_data); 457 + 458 + VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), page); 459 + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page); 460 + 461 + return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); 462 + } 463 + 464 + /* 465 + * page_objcgs_check - get the object cgroups vector associated with a page 466 + * @page: a pointer to the page struct 467 + * 468 + * Returns a pointer to the object cgroups vector associated with the page, 469 + * or NULL. This function is safe to use if the page can be directly associated 470 + * with a memory cgroup. 
471 + */ 472 + static inline struct obj_cgroup **page_objcgs_check(struct page *page) 473 + { 474 + unsigned long memcg_data = READ_ONCE(page->memcg_data); 475 + 476 + if (!memcg_data || !(memcg_data & MEMCG_DATA_OBJCGS)) 477 + return NULL; 478 + 479 + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page); 480 + 481 + return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); 482 + } 483 + 484 + /* 485 + * set_page_objcgs - associate a page with an object cgroups vector 486 + * @page: a pointer to the page struct 487 + * @objcgs: a pointer to the object cgroups vector 488 + * 489 + * Atomically associates a page with a vector of object cgroups. 490 + */ 491 + static inline bool set_page_objcgs(struct page *page, 492 + struct obj_cgroup **objcgs) 493 + { 494 + return !cmpxchg(&page->memcg_data, 0, (unsigned long)objcgs | 495 + MEMCG_DATA_OBJCGS); 496 + } 497 + #else 498 + static inline struct obj_cgroup **page_objcgs(struct page *page) 499 + { 500 + return NULL; 501 + } 502 + 503 + static inline struct obj_cgroup **page_objcgs_check(struct page *page) 504 + { 505 + return NULL; 506 + } 507 + 508 + static inline bool set_page_objcgs(struct page *page, 509 + struct obj_cgroup **objcgs) 510 + { 511 + return true; 512 + } 513 + #endif 514 + 346 515 static __always_inline bool memcg_stat_item_in_bytes(int idx) 347 516 { 348 517 if (idx == MEMCG_PERCPU_B) ··· 912 743 static inline void __mod_memcg_page_state(struct page *page, 913 744 int idx, int val) 914 745 { 915 - if (page->mem_cgroup) 916 - __mod_memcg_state(page->mem_cgroup, idx, val); 746 + struct mem_cgroup *memcg = page_memcg(page); 747 + 748 + if (memcg) 749 + __mod_memcg_state(memcg, idx, val); 917 750 } 918 751 919 752 static inline void mod_memcg_page_state(struct page *page, 920 753 int idx, int val) 921 754 { 922 - if (page->mem_cgroup) 923 - mod_memcg_state(page->mem_cgroup, idx, val); 755 + struct mem_cgroup *memcg = page_memcg(page); 756 + 757 + if (memcg) 758 + mod_memcg_state(memcg, idx, val); 924 
759 } 925 760 926 761 static inline unsigned long lruvec_page_state(struct lruvec *lruvec, ··· 1007 834 enum node_stat_item idx, int val) 1008 835 { 1009 836 struct page *head = compound_head(page); /* rmap on tail pages */ 837 + struct mem_cgroup *memcg = page_memcg(head); 1010 838 pg_data_t *pgdat = page_pgdat(page); 1011 839 struct lruvec *lruvec; 1012 840 1013 841 /* Untracked pages have no memcg, no lruvec. Update only the node */ 1014 - if (!head->mem_cgroup) { 842 + if (!memcg) { 1015 843 __mod_node_page_state(pgdat, idx, val); 1016 844 return; 1017 845 } 1018 846 1019 - lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat); 847 + lruvec = mem_cgroup_lruvec(memcg, pgdat); 1020 848 __mod_lruvec_state(lruvec, idx, val); 1021 849 } 1022 850 ··· 1052 878 static inline void count_memcg_page_event(struct page *page, 1053 879 enum vm_event_item idx) 1054 880 { 1055 - if (page->mem_cgroup) 1056 - count_memcg_events(page->mem_cgroup, idx, 1); 881 + struct mem_cgroup *memcg = page_memcg(page); 882 + 883 + if (memcg) 884 + count_memcg_events(memcg, idx, 1); 1057 885 } 1058 886 1059 887 static inline void count_memcg_event_mm(struct mm_struct *mm, ··· 1123 947 #define MEM_CGROUP_ID_MAX 0 1124 948 1125 949 struct mem_cgroup; 950 + 951 + static inline struct mem_cgroup *page_memcg(struct page *page) 952 + { 953 + return NULL; 954 + } 955 + 956 + static inline struct mem_cgroup *page_memcg_rcu(struct page *page) 957 + { 958 + WARN_ON_ONCE(!rcu_read_lock_held()); 959 + return NULL; 960 + } 961 + 962 + static inline struct mem_cgroup *page_memcg_check(struct page *page) 963 + { 964 + return NULL; 965 + } 966 + 967 + static inline bool PageMemcgKmem(struct page *page) 968 + { 969 + return false; 970 + } 1126 971 1127 972 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 1128 973 { ··· 1634 1437 if (mem_cgroup_disabled()) 1635 1438 return; 1636 1439 1637 - if (unlikely(&page->mem_cgroup->css != wb->memcg_css)) 1440 + if (unlikely(&page_memcg(page)->css != 
wb->memcg_css)) 1638 1441 mem_cgroup_track_foreign_dirty_slowpath(page, wb); 1639 1442 } 1640 1443

-22

include/linux/mm.h

··· 1484 1484 #endif 1485 1485 } 1486 1486 1487 - #ifdef CONFIG_MEMCG 1488 - static inline struct mem_cgroup *page_memcg(struct page *page) 1489 - { 1490 - return page->mem_cgroup; 1491 - } 1492 - static inline struct mem_cgroup *page_memcg_rcu(struct page *page) 1493 - { 1494 - WARN_ON_ONCE(!rcu_read_lock_held()); 1495 - return READ_ONCE(page->mem_cgroup); 1496 - } 1497 - #else 1498 - static inline struct mem_cgroup *page_memcg(struct page *page) 1499 - { 1500 - return NULL; 1501 - } 1502 - static inline struct mem_cgroup *page_memcg_rcu(struct page *page) 1503 - { 1504 - WARN_ON_ONCE(!rcu_read_lock_held()); 1505 - return NULL; 1506 - } 1507 - #endif 1508 - 1509 1487 /* 1510 1488 * Some inline functions in vmstat.h depend on page_zone() 1511 1489 */

+1 -4

include/linux/mm_types.h

··· 199 199 atomic_t _refcount; 200 200 201 201 #ifdef CONFIG_MEMCG 202 - union { 203 - struct mem_cgroup *mem_cgroup; 204 - struct obj_cgroup **obj_cgroups; 205 - }; 202 + unsigned long memcg_data; 206 203 #endif 207 204 208 205 /*

+21 -14

include/linux/netdevice.h

··· 350 350 }; 351 351 352 352 enum { 353 - NAPI_STATE_SCHED, /* Poll is scheduled */ 354 - NAPI_STATE_MISSED, /* reschedule a napi */ 355 - NAPI_STATE_DISABLE, /* Disable pending */ 356 - NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ 357 - NAPI_STATE_LISTED, /* NAPI added to system lists */ 358 - NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */ 359 - NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */ 353 + NAPI_STATE_SCHED, /* Poll is scheduled */ 354 + NAPI_STATE_MISSED, /* reschedule a napi */ 355 + NAPI_STATE_DISABLE, /* Disable pending */ 356 + NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ 357 + NAPI_STATE_LISTED, /* NAPI added to system lists */ 358 + NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */ 359 + NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */ 360 + NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/ 360 361 }; 361 362 362 363 enum { 363 - NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED), 364 - NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED), 365 - NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE), 366 - NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC), 367 - NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED), 368 - NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), 369 - NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), 364 + NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED), 365 + NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED), 366 + NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE), 367 + NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC), 368 + NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED), 369 + NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), 370 + NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), 371 + NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL), 370 372 }; 371 373 372 374 enum gro_result { ··· 437 435 static inline bool napi_disable_pending(struct napi_struct *n) 438 436 { 439 437 return test_bit(NAPI_STATE_DISABLE, 
&n->state); 438 + } 439 + 440 + static inline bool napi_prefer_busy_poll(struct napi_struct *n) 441 + { 442 + return test_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); 440 443 } 441 444 442 445 bool napi_schedule_prep(struct napi_struct *n);

+2 -9

include/linux/page-flags.h

··· 715 715 #define PAGE_MAPCOUNT_RESERVE -128 716 716 #define PG_buddy 0x00000080 717 717 #define PG_offline 0x00000100 718 - #define PG_kmemcg 0x00000200 719 - #define PG_table 0x00000400 720 - #define PG_guard 0x00000800 718 + #define PG_table 0x00000200 719 + #define PG_guard 0x00000400 721 720 722 721 #define PageType(page, flag) \ 723 722 ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) ··· 766 767 * buddy via online_page_callback_t. 767 768 */ 768 769 PAGE_TYPE_OPS(Offline, offline) 769 - 770 - /* 771 - * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on 772 - * pages allocated with __GFP_ACCOUNT. It gets cleared on page free. 773 - */ 774 - PAGE_TYPE_OPS(Kmemcg, kmemcg) 775 770 776 771 /* 777 772 * Marks pages in use as page tables.

+23 -4

include/net/busy_poll.h

··· 23 23 */ 24 24 #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) 25 25 26 + #define BUSY_POLL_BUDGET 8 27 + 26 28 #ifdef CONFIG_NET_RX_BUSY_POLL 27 29 28 30 struct napi_struct; ··· 45 43 46 44 void napi_busy_loop(unsigned int napi_id, 47 45 bool (*loop_end)(void *, unsigned long), 48 - void *loop_end_arg); 46 + void *loop_end_arg, bool prefer_busy_poll, u16 budget); 49 47 50 48 #else /* CONFIG_NET_RX_BUSY_POLL */ 51 49 static inline unsigned long net_busy_loop_on(void) ··· 107 105 unsigned int napi_id = READ_ONCE(sk->sk_napi_id); 108 106 109 107 if (napi_id >= MIN_NAPI_ID) 110 - napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk); 108 + napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk, 109 + READ_ONCE(sk->sk_prefer_busy_poll), 110 + READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET); 111 111 #endif 112 112 } 113 113 ··· 135 131 sk_rx_queue_set(sk, skb); 136 132 } 137 133 134 + static inline void __sk_mark_napi_id_once(struct sock *sk, unsigned int napi_id) 135 + { 136 + #ifdef CONFIG_NET_RX_BUSY_POLL 137 + if (!READ_ONCE(sk->sk_napi_id)) 138 + WRITE_ONCE(sk->sk_napi_id, napi_id); 139 + #endif 140 + } 141 + 138 142 /* variant used for unconnected sockets */ 139 143 static inline void sk_mark_napi_id_once(struct sock *sk, 140 144 const struct sk_buff *skb) 141 145 { 142 146 #ifdef CONFIG_NET_RX_BUSY_POLL 143 - if (!READ_ONCE(sk->sk_napi_id)) 144 - WRITE_ONCE(sk->sk_napi_id, skb->napi_id); 147 + __sk_mark_napi_id_once(sk, skb->napi_id); 148 + #endif 149 + } 150 + 151 + static inline void sk_mark_napi_id_once_xdp(struct sock *sk, 152 + const struct xdp_buff *xdp) 153 + { 154 + #ifdef CONFIG_NET_RX_BUSY_POLL 155 + __sk_mark_napi_id_once(sk, xdp->rxq->napi_id); 145 156 #endif 146 157 } 147 158

+6

include/net/sock.h

··· 301 301 * @sk_ack_backlog: current listen backlog 302 302 * @sk_max_ack_backlog: listen backlog set in listen() 303 303 * @sk_uid: user id of owner 304 + * @sk_prefer_busy_poll: prefer busypolling over softirq processing 305 + * @sk_busy_poll_budget: napi processing budget when busypolling 304 306 * @sk_priority: %SO_PRIORITY setting 305 307 * @sk_type: socket type (%SOCK_STREAM, etc) 306 308 * @sk_protocol: which protocol this socket belongs in this network family ··· 481 479 u32 sk_ack_backlog; 482 480 u32 sk_max_ack_backlog; 483 481 kuid_t sk_uid; 482 + #ifdef CONFIG_NET_RX_BUSY_POLL 483 + u8 sk_prefer_busy_poll; 484 + u16 sk_busy_poll_budget; 485 + #endif 484 486 struct pid *sk_peer_pid; 485 487 const struct cred *sk_peer_cred; 486 488 long sk_rcvtimeo;

+1

include/net/tcp.h

··· 410 410 int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, 411 411 int flags, int *addr_len); 412 412 int tcp_set_rcvlowat(struct sock *sk, int val); 413 + int tcp_set_window_clamp(struct sock *sk, int val); 413 414 void tcp_data_ready(struct sock *sk); 414 415 #ifdef CONFIG_MMU 415 416 int tcp_mmap(struct file *file, struct socket *sock,

+2 -1

include/net/xdp.h

··· 59 59 u32 queue_index; 60 60 u32 reg_state; 61 61 struct xdp_mem_info mem; 62 + unsigned int napi_id; 62 63 } ____cacheline_aligned; /* perf critical, avoid false-sharing */ 63 64 64 65 struct xdp_txq_info { ··· 227 226 } 228 227 229 228 int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, 230 - struct net_device *dev, u32 queue_index); 229 + struct net_device *dev, u32 queue_index, unsigned int napi_id); 231 230 void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); 232 231 void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); 233 232 bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq);

+7

include/net/xdp_sock_drv.h

··· 13 13 14 14 void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); 15 15 bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); 16 + u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *desc, u32 max); 16 17 void xsk_tx_release(struct xsk_buff_pool *pool); 17 18 struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, 18 19 u16 queue_id); ··· 127 126 struct xdp_desc *desc) 128 127 { 129 128 return false; 129 + } 130 + 131 + static inline u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *desc, 132 + u32 max) 133 + { 134 + return 0; 130 135 } 131 136 132 137 static inline void xsk_tx_release(struct xsk_buff_pool *pool)

+1 -1

include/trace/events/writeback.h

··· 257 257 __entry->ino = inode ? inode->i_ino : 0; 258 258 __entry->memcg_id = wb->memcg_css->id; 259 259 __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); 260 - __entry->page_cgroup_ino = cgroup_ino(page->mem_cgroup->css.cgroup); 260 + __entry->page_cgroup_ino = cgroup_ino(page_memcg(page)->css.cgroup); 261 261 ), 262 262 263 263 TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu",

+3

include/uapi/asm-generic/socket.h

··· 119 119 120 120 #define SO_DETACH_REUSEPORT_BPF 68 121 121 122 + #define SO_PREFER_BUSY_POLL 69 123 + #define SO_BUSY_POLL_BUDGET 70 124 + 122 125 #if !defined(__KERNEL__) 123 126 124 127 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))

+44 -1

include/uapi/linux/bpf.h

··· 557 557 __aligned_u64 line_info; /* line info */ 558 558 __u32 line_info_cnt; /* number of bpf_line_info records */ 559 559 __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ 560 - __u32 attach_prog_fd; /* 0 to attach to vmlinux */ 560 + union { 561 + /* valid prog_fd to attach to bpf prog */ 562 + __u32 attach_prog_fd; 563 + /* or valid module BTF object fd or 0 to attach to vmlinux */ 564 + __u32 attach_btf_obj_fd; 565 + }; 561 566 }; 562 567 563 568 struct { /* anonymous struct used by BPF_OBJ_* commands */ ··· 3792 3787 * *ARG_PTR_TO_BTF_ID* of type *task_struct*. 3793 3788 * Return 3794 3789 * Pointer to the current task. 3790 + * 3791 + * long bpf_bprm_opts_set(struct linux_binprm *bprm, u64 flags) 3792 + * Description 3793 + * Set or clear certain options on *bprm*: 3794 + * 3795 + * **BPF_F_BPRM_SECUREEXEC** Set the secureexec bit 3796 + * which sets the **AT_SECURE** auxv for glibc. The bit 3797 + * is cleared if the flag is not specified. 3798 + * Return 3799 + * **-EINVAL** if invalid *flags* are passed, zero otherwise. 3800 + * 3801 + * u64 bpf_ktime_get_coarse_ns(void) 3802 + * Description 3803 + * Return a coarse-grained version of the time elapsed since 3804 + * system boot, in nanoseconds. Does not include time the system 3805 + * was suspended. 3806 + * 3807 + * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**) 3808 + * Return 3809 + * Current *ktime*. 3810 + * 3811 + * long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size) 3812 + * Description 3813 + * Returns the stored IMA hash of the *inode* (if it's available). 3814 + * If the hash is larger than *size*, then only *size* 3815 + * bytes will be copied to *dst* 3816 + * Return 3817 + * The **hash_algo** is returned on success, 3818 + * **-EOPNOTSUPP** if IMA is disabled or **-EINVAL** if 3819 + * invalid arguments are passed. 
3795 3820 */ 3796 3821 #define __BPF_FUNC_MAPPER(FN) \ 3797 3822 FN(unspec), \ ··· 3983 3948 FN(task_storage_get), \ 3984 3949 FN(task_storage_delete), \ 3985 3950 FN(get_current_task_btf), \ 3951 + FN(bprm_opts_set), \ 3952 + FN(ktime_get_coarse_ns), \ 3953 + FN(ima_inode_hash), \ 3986 3954 /* */ 3987 3955 3988 3956 /* integer value in 'imm' field of BPF_CALL instruction selects which helper ··· 4155 4117 BPF_LWT_ENCAP_SEG6, 4156 4118 BPF_LWT_ENCAP_SEG6_INLINE, 4157 4119 BPF_LWT_ENCAP_IP, 4120 + }; 4121 + 4122 + /* Flags for bpf_bprm_opts_set helper */ 4123 + enum { 4124 + BPF_F_BPRM_SECUREEXEC = (1ULL << 0), 4158 4125 }; 4159 4126 4160 4127 #define __bpf_md_ptr(type, name) \

+7 -23

kernel/bpf/arraymap.c

··· 34 34 int i; 35 35 36 36 for (i = 0; i < array->map.max_entries; i++) { 37 - ptr = __alloc_percpu_gfp(array->elem_size, 8, 38 - GFP_USER | __GFP_NOWARN); 37 + ptr = bpf_map_alloc_percpu(&array->map, array->elem_size, 8, 38 + GFP_USER | __GFP_NOWARN); 39 39 if (!ptr) { 40 40 bpf_array_free_percpu(array); 41 41 return -ENOMEM; ··· 81 81 static struct bpf_map *array_map_alloc(union bpf_attr *attr) 82 82 { 83 83 bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; 84 - int ret, numa_node = bpf_map_attr_numa_node(attr); 84 + int numa_node = bpf_map_attr_numa_node(attr); 85 85 u32 elem_size, index_mask, max_entries; 86 86 bool bypass_spec_v1 = bpf_bypass_spec_v1(); 87 - u64 cost, array_size, mask64; 88 - struct bpf_map_memory mem; 87 + u64 array_size, mask64; 89 88 struct bpf_array *array; 90 89 91 90 elem_size = round_up(attr->value_size, 8); ··· 125 126 } 126 127 } 127 128 128 - /* make sure there is no u32 overflow later in round_up() */ 129 - cost = array_size; 130 - if (percpu) 131 - cost += (u64)attr->max_entries * elem_size * num_possible_cpus(); 132 - 133 - ret = bpf_map_charge_init(&mem, cost); 134 - if (ret < 0) 135 - return ERR_PTR(ret); 136 - 137 129 /* allocate all map elements and zero-initialize them */ 138 130 if (attr->map_flags & BPF_F_MMAPABLE) { 139 131 void *data; 140 132 141 133 /* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */ 142 134 data = bpf_map_area_mmapable_alloc(array_size, numa_node); 143 - if (!data) { 144 - bpf_map_charge_finish(&mem); 135 + if (!data) 145 136 return ERR_PTR(-ENOMEM); 146 - } 147 137 array = data + PAGE_ALIGN(sizeof(struct bpf_array)) 148 138 - offsetof(struct bpf_array, value); 149 139 } else { 150 140 array = bpf_map_area_alloc(array_size, numa_node); 151 141 } 152 - if (!array) { 153 - bpf_map_charge_finish(&mem); 142 + if (!array) 154 143 return ERR_PTR(-ENOMEM); 155 - } 156 144 array->index_mask = index_mask; 157 145 array->map.bypass_spec_v1 = bypass_spec_v1; 158 146 159 147 /* copy mandatory 
map attributes */ 160 148 bpf_map_init_from_attr(&array->map, attr); 161 - bpf_map_charge_move(&array->map.memory, &mem); 162 149 array->elem_size = elem_size; 163 150 164 151 if (percpu && bpf_array_alloc_percpu(array)) { 165 - bpf_map_charge_finish(&array->map.memory); 166 152 bpf_map_area_free(array); 167 153 return ERR_PTR(-ENOMEM); 168 154 } ··· 1002 1018 struct bpf_array_aux *aux; 1003 1019 struct bpf_map *map; 1004 1020 1005 - aux = kzalloc(sizeof(*aux), GFP_KERNEL); 1021 + aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT); 1006 1022 if (!aux) 1007 1023 return ERR_PTR(-ENOMEM); 1008 1024

+6 -14

kernel/bpf/bpf_local_storage.c

··· 67 67 if (charge_mem && mem_charge(smap, owner, smap->elem_size)) 68 68 return NULL; 69 69 70 - selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN); 70 + selem = bpf_map_kzalloc(&smap->map, smap->elem_size, 71 + GFP_ATOMIC | __GFP_NOWARN); 71 72 if (selem) { 72 73 if (value) 73 74 memcpy(SDATA(selem)->data, value, smap->map.value_size); ··· 265 264 if (err) 266 265 return err; 267 266 268 - storage = kzalloc(sizeof(*storage), GFP_ATOMIC | __GFP_NOWARN); 267 + storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), 268 + GFP_ATOMIC | __GFP_NOWARN); 269 269 if (!storage) { 270 270 err = -ENOMEM; 271 271 goto uncharge; ··· 545 543 struct bpf_local_storage_map *smap; 546 544 unsigned int i; 547 545 u32 nbuckets; 548 - u64 cost; 549 - int ret; 550 546 551 - smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN); 547 + smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); 552 548 if (!smap) 553 549 return ERR_PTR(-ENOMEM); 554 550 bpf_map_init_from_attr(&smap->map, attr); ··· 555 555 /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ 556 556 nbuckets = max_t(u32, 2, nbuckets); 557 557 smap->bucket_log = ilog2(nbuckets); 558 - cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); 559 - 560 - ret = bpf_map_charge_init(&smap->map.memory, cost); 561 - if (ret < 0) { 562 - kfree(smap); 563 - return ERR_PTR(ret); 564 - } 565 558 566 559 smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, 567 - GFP_USER | __GFP_NOWARN); 560 + GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); 568 561 if (!smap->buckets) { 569 - bpf_map_charge_finish(&smap->map.memory); 570 562 kfree(smap); 571 563 return ERR_PTR(-ENOMEM); 572 564 }

+52

kernel/bpf/bpf_lsm.c

··· 7 7 #include <linux/filter.h> 8 8 #include <linux/bpf.h> 9 9 #include <linux/btf.h> 10 + #include <linux/binfmts.h> 10 11 #include <linux/lsm_hooks.h> 11 12 #include <linux/bpf_lsm.h> 12 13 #include <linux/kallsyms.h> ··· 15 14 #include <net/bpf_sk_storage.h> 16 15 #include <linux/bpf_local_storage.h> 17 16 #include <linux/btf_ids.h> 17 + #include <linux/ima.h> 18 18 19 19 /* For every LSM hook that allows attachment of BPF programs, declare a nop 20 20 * function where a BPF program can be attached. ··· 53 51 return 0; 54 52 } 55 53 54 + /* Mask for all the currently supported BPRM option flags */ 55 + #define BPF_F_BRPM_OPTS_MASK BPF_F_BPRM_SECUREEXEC 56 + 57 + BPF_CALL_2(bpf_bprm_opts_set, struct linux_binprm *, bprm, u64, flags) 58 + { 59 + if (flags & ~BPF_F_BRPM_OPTS_MASK) 60 + return -EINVAL; 61 + 62 + bprm->secureexec = (flags & BPF_F_BPRM_SECUREEXEC); 63 + return 0; 64 + } 65 + 66 + BTF_ID_LIST_SINGLE(bpf_bprm_opts_set_btf_ids, struct, linux_binprm) 67 + 68 + const static struct bpf_func_proto bpf_bprm_opts_set_proto = { 69 + .func = bpf_bprm_opts_set, 70 + .gpl_only = false, 71 + .ret_type = RET_INTEGER, 72 + .arg1_type = ARG_PTR_TO_BTF_ID, 73 + .arg1_btf_id = &bpf_bprm_opts_set_btf_ids[0], 74 + .arg2_type = ARG_ANYTHING, 75 + }; 76 + 77 + BPF_CALL_3(bpf_ima_inode_hash, struct inode *, inode, void *, dst, u32, size) 78 + { 79 + return ima_inode_hash(inode, dst, size); 80 + } 81 + 82 + static bool bpf_ima_inode_hash_allowed(const struct bpf_prog *prog) 83 + { 84 + return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id); 85 + } 86 + 87 + BTF_ID_LIST_SINGLE(bpf_ima_inode_hash_btf_ids, struct, inode) 88 + 89 + const static struct bpf_func_proto bpf_ima_inode_hash_proto = { 90 + .func = bpf_ima_inode_hash, 91 + .gpl_only = false, 92 + .ret_type = RET_INTEGER, 93 + .arg1_type = ARG_PTR_TO_BTF_ID, 94 + .arg1_btf_id = &bpf_ima_inode_hash_btf_ids[0], 95 + .arg2_type = ARG_PTR_TO_UNINIT_MEM, 96 + .arg3_type = ARG_CONST_SIZE, 97 + .allowed = 
bpf_ima_inode_hash_allowed, 98 + }; 99 + 56 100 static const struct bpf_func_proto * 57 101 bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 58 102 { ··· 119 71 return &bpf_task_storage_get_proto; 120 72 case BPF_FUNC_task_storage_delete: 121 73 return &bpf_task_storage_delete_proto; 74 + case BPF_FUNC_bprm_opts_set: 75 + return &bpf_bprm_opts_set_proto; 76 + case BPF_FUNC_ima_inode_hash: 77 + return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL; 122 78 default: 123 79 return tracing_prog_func_proto(func_id, prog); 124 80 }

+3 -16

kernel/bpf/bpf_struct_ops.c

··· 548 548 static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) 549 549 { 550 550 const struct bpf_struct_ops *st_ops; 551 - size_t map_total_size, st_map_size; 551 + size_t st_map_size; 552 552 struct bpf_struct_ops_map *st_map; 553 553 const struct btf_type *t, *vt; 554 - struct bpf_map_memory mem; 555 554 struct bpf_map *map; 556 - int err; 557 555 558 556 if (!bpf_capable()) 559 557 return ERR_PTR(-EPERM); ··· 571 573 * struct bpf_struct_ops_tcp_congestions_ops 572 574 */ 573 575 (vt->size - sizeof(struct bpf_struct_ops_value)); 574 - map_total_size = st_map_size + 575 - /* uvalue */ 576 - sizeof(vt->size) + 577 - /* struct bpf_progs **progs */ 578 - btf_type_vlen(t) * sizeof(struct bpf_prog *); 579 - err = bpf_map_charge_init(&mem, map_total_size); 580 - if (err < 0) 581 - return ERR_PTR(err); 582 576 583 577 st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE); 584 - if (!st_map) { 585 - bpf_map_charge_finish(&mem); 578 + if (!st_map) 586 579 return ERR_PTR(-ENOMEM); 587 - } 580 + 588 581 st_map->st_ops = st_ops; 589 582 map = &st_map->map; 590 583 ··· 586 597 st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); 587 598 if (!st_map->uvalue || !st_map->progs || !st_map->image) { 588 599 bpf_struct_ops_map_free(map); 589 - bpf_map_charge_finish(&mem); 590 600 return ERR_PTR(-ENOMEM); 591 601 } 592 602 593 603 mutex_init(&st_map->lock); 594 604 set_vm_flush_reset_perms(st_map->image); 595 605 bpf_map_init_from_attr(map, attr); 596 - bpf_map_charge_move(&map->memory, &mem); 597 606 598 607 return map; 599 608 }

+49 -21

kernel/bpf/btf.c

··· 1524 1524 btf_free(btf); 1525 1525 } 1526 1526 1527 + void btf_get(struct btf *btf) 1528 + { 1529 + refcount_inc(&btf->refcnt); 1530 + } 1531 + 1527 1532 void btf_put(struct btf *btf) 1528 1533 { 1529 1534 if (btf && refcount_dec_and_test(&btf->refcnt)) { ··· 4560 4555 { 4561 4556 struct bpf_prog *tgt_prog = prog->aux->dst_prog; 4562 4557 4563 - if (tgt_prog) { 4558 + if (tgt_prog) 4564 4559 return tgt_prog->aux->btf; 4565 - } else { 4566 - return btf_vmlinux; 4567 - } 4560 + else 4561 + return prog->aux->attach_btf; 4568 4562 } 4569 4563 4570 4564 static bool is_string_ptr(struct btf *btf, const struct btf_type *t) ··· 4704 4700 4705 4701 if (ctx_arg_info->offset == off) { 4706 4702 info->reg_type = ctx_arg_info->reg_type; 4703 + info->btf = btf_vmlinux; 4707 4704 info->btf_id = ctx_arg_info->btf_id; 4708 4705 return true; 4709 4706 } ··· 4721 4716 4722 4717 ret = btf_translate_to_vmlinux(log, btf, t, tgt_type, arg); 4723 4718 if (ret > 0) { 4719 + info->btf = btf_vmlinux; 4724 4720 info->btf_id = ret; 4725 4721 return true; 4726 4722 } else { ··· 4729 4723 } 4730 4724 } 4731 4725 4726 + info->btf = btf; 4732 4727 info->btf_id = t->type; 4733 4728 t = btf_type_by_id(btf, t->type); 4734 4729 /* skip modifiers */ ··· 4756 4749 WALK_STRUCT, 4757 4750 }; 4758 4751 4759 - static int btf_struct_walk(struct bpf_verifier_log *log, 4752 + static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, 4760 4753 const struct btf_type *t, int off, int size, 4761 4754 u32 *next_btf_id) 4762 4755 { ··· 4767 4760 u32 vlen, elem_id, mid; 4768 4761 4769 4762 again: 4770 - tname = __btf_name_by_offset(btf_vmlinux, t->name_off); 4763 + tname = __btf_name_by_offset(btf, t->name_off); 4771 4764 if (!btf_type_is_struct(t)) { 4772 4765 bpf_log(log, "Type '%s' is not a struct\n", tname); 4773 4766 return -EINVAL; ··· 4784 4777 goto error; 4785 4778 4786 4779 member = btf_type_member(t) + vlen - 1; 4787 - mtype = btf_type_skip_modifiers(btf_vmlinux, member->type, 4780 
+ mtype = btf_type_skip_modifiers(btf, member->type, 4788 4781 NULL); 4789 4782 if (!btf_type_is_array(mtype)) 4790 4783 goto error; ··· 4800 4793 /* Only allow structure for now, can be relaxed for 4801 4794 * other types later. 4802 4795 */ 4803 - t = btf_type_skip_modifiers(btf_vmlinux, array_elem->type, 4796 + t = btf_type_skip_modifiers(btf, array_elem->type, 4804 4797 NULL); 4805 4798 if (!btf_type_is_struct(t)) 4806 4799 goto error; ··· 4858 4851 4859 4852 /* type of the field */ 4860 4853 mid = member->type; 4861 - mtype = btf_type_by_id(btf_vmlinux, member->type); 4862 - mname = __btf_name_by_offset(btf_vmlinux, member->name_off); 4854 + mtype = btf_type_by_id(btf, member->type); 4855 + mname = __btf_name_by_offset(btf, member->name_off); 4863 4856 4864 - mtype = __btf_resolve_size(btf_vmlinux, mtype, &msize, 4857 + mtype = __btf_resolve_size(btf, mtype, &msize, 4865 4858 &elem_type, &elem_id, &total_nelems, 4866 4859 &mid); 4867 4860 if (IS_ERR(mtype)) { ··· 4956 4949 mname, moff, tname, off, size); 4957 4950 return -EACCES; 4958 4951 } 4959 - stype = btf_type_skip_modifiers(btf_vmlinux, mtype->type, &id); 4952 + stype = btf_type_skip_modifiers(btf, mtype->type, &id); 4960 4953 if (btf_type_is_struct(stype)) { 4961 4954 *next_btf_id = id; 4962 4955 return WALK_PTR; ··· 4982 4975 return -EINVAL; 4983 4976 } 4984 4977 4985 - int btf_struct_access(struct bpf_verifier_log *log, 4978 + int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, 4986 4979 const struct btf_type *t, int off, int size, 4987 4980 enum bpf_access_type atype __maybe_unused, 4988 4981 u32 *next_btf_id) ··· 4991 4984 u32 id; 4992 4985 4993 4986 do { 4994 - err = btf_struct_walk(log, t, off, size, &id); 4987 + err = btf_struct_walk(log, btf, t, off, size, &id); 4995 4988 4996 4989 switch (err) { 4997 4990 case WALK_PTR: ··· 5007 5000 * by diving in it. At this point the offset is 5008 5001 * aligned with the new type, so set it to 0. 
5009 5002 */ 5010 - t = btf_type_by_id(btf_vmlinux, id); 5003 + t = btf_type_by_id(btf, id); 5011 5004 off = 0; 5012 5005 break; 5013 5006 default: ··· 5023 5016 return -EINVAL; 5024 5017 } 5025 5018 5019 + /* Check that two BTF types, each specified as a BTF object + id, are exactly 5020 + * the same. Trivial ID check is not enough due to module BTFs, because we can 5021 + * end up with two different module BTFs, but IDs point to the common type in 5022 + * vmlinux BTF. 5023 + */ 5024 + static bool btf_types_are_same(const struct btf *btf1, u32 id1, 5025 + const struct btf *btf2, u32 id2) 5026 + { 5027 + if (id1 != id2) 5028 + return false; 5029 + if (btf1 == btf2) 5030 + return true; 5031 + return btf_type_by_id(btf1, id1) == btf_type_by_id(btf2, id2); 5032 + } 5033 + 5026 5034 bool btf_struct_ids_match(struct bpf_verifier_log *log, 5027 - int off, u32 id, u32 need_type_id) 5035 + const struct btf *btf, u32 id, int off, 5036 + const struct btf *need_btf, u32 need_type_id) 5028 5037 { 5029 5038 const struct btf_type *type; 5030 5039 int err; 5031 5040 5032 5041 /* Are we already done? */ 5033 - if (need_type_id == id && off == 0) 5042 + if (off == 0 && btf_types_are_same(btf, id, need_btf, need_type_id)) 5034 5043 return true; 5035 5044 5036 5045 again: 5037 - type = btf_type_by_id(btf_vmlinux, id); 5046 + type = btf_type_by_id(btf, id); 5038 5047 if (!type) 5039 5048 return false; 5040 - err = btf_struct_walk(log, type, off, 1, &id); 5049 + err = btf_struct_walk(log, btf, type, off, 1, &id); 5041 5050 if (err != WALK_STRUCT) 5042 5051 return false; 5043 5052 ··· 5062 5039 * continue the search with offset 0 in the new 5063 5040 * type. 
5064 5041 */ 5065 - if (need_type_id != id) { 5042 + if (!btf_types_are_same(btf, id, need_btf, need_type_id)) { 5066 5043 off = 0; 5067 5044 goto again; 5068 5045 } ··· 5733 5710 return fd; 5734 5711 } 5735 5712 5736 - u32 btf_id(const struct btf *btf) 5713 + u32 btf_obj_id(const struct btf *btf) 5737 5714 { 5738 5715 return btf->id; 5716 + } 5717 + 5718 + bool btf_is_kernel(const struct btf *btf) 5719 + { 5720 + return btf->kernel_btf; 5739 5721 } 5740 5722 5741 5723 static int btf_id_cmp_func(const void *a, const void *b)

+8 -15

kernel/bpf/core.c

··· 77 77 78 78 struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags) 79 79 { 80 - gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; 80 + gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; 81 81 struct bpf_prog_aux *aux; 82 82 struct bpf_prog *fp; 83 83 ··· 86 86 if (fp == NULL) 87 87 return NULL; 88 88 89 - aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); 89 + aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT | gfp_extra_flags); 90 90 if (aux == NULL) { 91 91 vfree(fp); 92 92 return NULL; ··· 106 106 107 107 struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) 108 108 { 109 - gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; 109 + gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; 110 110 struct bpf_prog *prog; 111 111 int cpu; 112 112 ··· 138 138 139 139 prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo, 140 140 sizeof(*prog->aux->jited_linfo), 141 - GFP_KERNEL | __GFP_NOWARN); 141 + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); 142 142 if (!prog->aux->jited_linfo) 143 143 return -ENOMEM; 144 144 ··· 219 219 struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, 220 220 gfp_t gfp_extra_flags) 221 221 { 222 - gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; 222 + gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; 223 223 struct bpf_prog *fp; 224 - u32 pages, delta; 225 - int ret; 224 + u32 pages; 226 225 227 226 size = round_up(size, PAGE_SIZE); 228 227 pages = size / PAGE_SIZE; 229 228 if (pages <= fp_old->pages) 230 229 return fp_old; 231 230 232 - delta = pages - fp_old->pages; 233 - ret = __bpf_prog_charge(fp_old->aux->user, delta); 234 - if (ret) 235 - return NULL; 236 - 237 231 fp = __vmalloc(size, gfp_flags); 238 - if (fp == NULL) { 239 - __bpf_prog_uncharge(fp_old->aux->user, delta); 240 - } else { 232 + if (fp) { 241 233 memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); 242 234 fp->pages = pages; 243 
235 fp->aux->prog = fp; ··· 2203 2211 const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; 2204 2212 const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; 2205 2213 const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; 2214 + const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak; 2206 2215 2207 2216 const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; 2208 2217 const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;

+13 -24

kernel/bpf/cpumap.c

··· 84 84 u32 value_size = attr->value_size; 85 85 struct bpf_cpu_map *cmap; 86 86 int err = -ENOMEM; 87 - u64 cost; 88 - int ret; 89 87 90 88 if (!bpf_capable()) 91 89 return ERR_PTR(-EPERM); ··· 95 97 attr->map_flags & ~BPF_F_NUMA_NODE) 96 98 return ERR_PTR(-EINVAL); 97 99 98 - cmap = kzalloc(sizeof(*cmap), GFP_USER); 100 + cmap = kzalloc(sizeof(*cmap), GFP_USER | __GFP_ACCOUNT); 99 101 if (!cmap) 100 102 return ERR_PTR(-ENOMEM); 101 103 ··· 107 109 goto free_cmap; 108 110 } 109 111 110 - /* make sure page count doesn't overflow */ 111 - cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); 112 - 113 - /* Notice returns -EPERM on if map size is larger than memlock limit */ 114 - ret = bpf_map_charge_init(&cmap->map.memory, cost); 115 - if (ret) { 116 - err = ret; 117 - goto free_cmap; 118 - } 119 - 120 112 /* Alloc array for possible remote "destination" CPUs */ 121 113 cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * 122 114 sizeof(struct bpf_cpu_map_entry *), 123 115 cmap->map.numa_node); 124 116 if (!cmap->cpu_map) 125 - goto free_charge; 117 + goto free_cmap; 126 118 127 119 return &cmap->map; 128 - free_charge: 129 - bpf_map_charge_finish(&cmap->map.memory); 130 120 free_cmap: 131 121 kfree(cmap); 132 122 return ERR_PTR(err); ··· 398 412 } 399 413 400 414 static struct bpf_cpu_map_entry * 401 - __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) 415 + __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, 416 + u32 cpu) 402 417 { 403 418 int numa, err, i, fd = value->bpf_prog.fd; 404 419 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; ··· 409 422 /* Have map->numa_node, but choose node of redirect target CPU */ 410 423 numa = cpu_to_node(cpu); 411 424 412 - rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa); 425 + rcpu = bpf_map_kmalloc_node(map, sizeof(*rcpu), gfp | __GFP_ZERO, numa); 413 426 if (!rcpu) 414 427 return NULL; 415 428 416 429 /* Alloc percpu bulkq */ 417 - rcpu->bulkq = 
__alloc_percpu_gfp(sizeof(*rcpu->bulkq), 418 - sizeof(void *), gfp); 430 + rcpu->bulkq = bpf_map_alloc_percpu(map, sizeof(*rcpu->bulkq), 431 + sizeof(void *), gfp); 419 432 if (!rcpu->bulkq) 420 433 goto free_rcu; 421 434 ··· 425 438 } 426 439 427 440 /* Alloc queue */ 428 - rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); 441 + rcpu->queue = bpf_map_kmalloc_node(map, sizeof(*rcpu->queue), gfp, 442 + numa); 429 443 if (!rcpu->queue) 430 444 goto free_bulkq; 431 445 ··· 435 447 goto free_queue; 436 448 437 449 rcpu->cpu = cpu; 438 - rcpu->map_id = map_id; 450 + rcpu->map_id = map->id; 439 451 rcpu->value.qsize = value->qsize; 440 452 441 453 if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd)) ··· 443 455 444 456 /* Setup kthread */ 445 457 rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, 446 - "cpumap/%d/map:%d", cpu, map_id); 458 + "cpumap/%d/map:%d", cpu, 459 + map->id); 447 460 if (IS_ERR(rcpu->kthread)) 448 461 goto free_prog; 449 462 ··· 560 571 rcpu = NULL; /* Same as deleting */ 561 572 } else { 562 573 /* Updating qsize cause re-allocation of bpf_cpu_map_entry */ 563 - rcpu = __cpu_map_entry_alloc(&cpumap_value, key_cpu, map->id); 574 + rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu); 564 575 if (!rcpu) 565 576 return -ENOMEM; 566 577 rcpu->cmap = cmap;

+6 -19

kernel/bpf/devmap.c

··· 109 109 static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) 110 110 { 111 111 u32 valsize = attr->value_size; 112 - u64 cost = 0; 113 - int err; 114 112 115 113 /* check sanity of attributes. 2 value sizes supported: 116 114 * 4 bytes: ifindex ··· 133 135 134 136 if (!dtab->n_buckets) /* Overflow check */ 135 137 return -EINVAL; 136 - cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets; 137 - } else { 138 - cost += (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); 139 138 } 140 - 141 - /* if map size is larger than memlock limit, reject it */ 142 - err = bpf_map_charge_init(&dtab->map.memory, cost); 143 - if (err) 144 - return -EINVAL; 145 139 146 140 if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { 147 141 dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets, 148 142 dtab->map.numa_node); 149 143 if (!dtab->dev_index_head) 150 - goto free_charge; 144 + return -ENOMEM; 151 145 152 146 spin_lock_init(&dtab->index_lock); 153 147 } else { ··· 147 157 sizeof(struct bpf_dtab_netdev *), 148 158 dtab->map.numa_node); 149 159 if (!dtab->netdev_map) 150 - goto free_charge; 160 + return -ENOMEM; 151 161 } 152 162 153 163 return 0; 154 - 155 - free_charge: 156 - bpf_map_charge_finish(&dtab->map.memory); 157 - return -ENOMEM; 158 164 } 159 165 160 166 static struct bpf_map *dev_map_alloc(union bpf_attr *attr) ··· 161 175 if (!capable(CAP_NET_ADMIN)) 162 176 return ERR_PTR(-EPERM); 163 177 164 - dtab = kzalloc(sizeof(*dtab), GFP_USER); 178 + dtab = kzalloc(sizeof(*dtab), GFP_USER | __GFP_ACCOUNT); 165 179 if (!dtab) 166 180 return ERR_PTR(-ENOMEM); 167 181 ··· 588 602 struct bpf_prog *prog = NULL; 589 603 struct bpf_dtab_netdev *dev; 590 604 591 - dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, 592 - dtab->map.numa_node); 605 + dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev), 606 + GFP_ATOMIC | __GFP_NOWARN, 607 + dtab->map.numa_node); 593 608 if (!dev) 594 609 return ERR_PTR(-ENOMEM); 595 610

+15 -28

kernel/bpf/hashtab.c

··· 292 292 u32 size = round_up(htab->map.value_size, 8); 293 293 void __percpu *pptr; 294 294 295 - pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN); 295 + pptr = bpf_map_alloc_percpu(&htab->map, size, 8, 296 + GFP_USER | __GFP_NOWARN); 296 297 if (!pptr) 297 298 goto free_elems; 298 299 htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, ··· 347 346 struct pcpu_freelist_node *l; 348 347 int cpu; 349 348 350 - pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8, 351 - GFP_USER | __GFP_NOWARN); 349 + pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8, 350 + GFP_USER | __GFP_NOWARN); 352 351 if (!pptr) 353 352 return -ENOMEM; 354 353 ··· 443 442 bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); 444 443 struct bpf_htab *htab; 445 444 int err, i; 446 - u64 cost; 447 445 448 - htab = kzalloc(sizeof(*htab), GFP_USER); 446 + htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT); 449 447 if (!htab) 450 448 return ERR_PTR(-ENOMEM); 451 449 ··· 480 480 htab->n_buckets > U32_MAX / sizeof(struct bucket)) 481 481 goto free_htab; 482 482 483 - cost = (u64) htab->n_buckets * sizeof(struct bucket) + 484 - (u64) htab->elem_size * htab->map.max_entries; 485 - 486 - if (percpu) 487 - cost += (u64) round_up(htab->map.value_size, 8) * 488 - num_possible_cpus() * htab->map.max_entries; 489 - else 490 - cost += (u64) htab->elem_size * num_possible_cpus(); 491 - 492 - /* if map size is larger than memlock limit, reject it */ 493 - err = bpf_map_charge_init(&htab->map.memory, cost); 494 - if (err) 495 - goto free_htab; 496 - 497 483 err = -ENOMEM; 498 484 htab->buckets = bpf_map_area_alloc(htab->n_buckets * 499 485 sizeof(struct bucket), 500 486 htab->map.numa_node); 501 487 if (!htab->buckets) 502 - goto free_charge; 488 + goto free_htab; 503 489 504 490 for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) { 505 - htab->map_locked[i] = __alloc_percpu_gfp(sizeof(int), 506 - sizeof(int), GFP_USER); 491 + htab->map_locked[i] = 
bpf_map_alloc_percpu(&htab->map, 492 + sizeof(int), 493 + sizeof(int), 494 + GFP_USER); 507 495 if (!htab->map_locked[i]) 508 496 goto free_map_locked; 509 497 } ··· 526 538 for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) 527 539 free_percpu(htab->map_locked[i]); 528 540 bpf_map_area_free(htab->buckets); 529 - free_charge: 530 - bpf_map_charge_finish(&htab->map.memory); 531 541 free_htab: 532 542 lockdep_unregister_key(&htab->lockdep_key); 533 543 kfree(htab); ··· 911 925 l_new = ERR_PTR(-E2BIG); 912 926 goto dec_count; 913 927 } 914 - l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, 915 - htab->map.numa_node); 928 + l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, 929 + GFP_ATOMIC | __GFP_NOWARN, 930 + htab->map.numa_node); 916 931 if (!l_new) { 917 932 l_new = ERR_PTR(-ENOMEM); 918 933 goto dec_count; ··· 929 942 pptr = htab_elem_get_ptr(l_new, key_size); 930 943 } else { 931 944 /* alloc_percpu zero-fills */ 932 - pptr = __alloc_percpu_gfp(size, 8, 933 - GFP_ATOMIC | __GFP_NOWARN); 945 + pptr = bpf_map_alloc_percpu(&htab->map, size, 8, 946 + GFP_ATOMIC | __GFP_NOWARN); 934 947 if (!pptr) { 935 948 kfree(l_new); 936 949 l_new = ERR_PTR(-ENOMEM);

+13

kernel/bpf/helpers.c

··· 167 167 .ret_type = RET_INTEGER, 168 168 }; 169 169 170 + BPF_CALL_0(bpf_ktime_get_coarse_ns) 171 + { 172 + return ktime_get_coarse_ns(); 173 + } 174 + 175 + const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = { 176 + .func = bpf_ktime_get_coarse_ns, 177 + .gpl_only = false, 178 + .ret_type = RET_INTEGER, 179 + }; 180 + 170 181 BPF_CALL_0(bpf_get_current_pid_tgid) 171 182 { 172 183 struct task_struct *task = current; ··· 696 685 return &bpf_ktime_get_ns_proto; 697 686 case BPF_FUNC_ktime_get_boot_ns: 698 687 return &bpf_ktime_get_boot_ns_proto; 688 + case BPF_FUNC_ktime_get_coarse_ns: 689 + return &bpf_ktime_get_coarse_ns_proto; 699 690 case BPF_FUNC_ringbuf_output: 700 691 return &bpf_ringbuf_output_proto; 701 692 case BPF_FUNC_ringbuf_reserve:

+12 -32

kernel/bpf/local_storage.c

··· 164 164 return 0; 165 165 } 166 166 167 - new = kmalloc_node(sizeof(struct bpf_storage_buffer) + 168 - map->value_size, 169 - __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, 170 - map->numa_node); 167 + new = bpf_map_kmalloc_node(map, sizeof(struct bpf_storage_buffer) + 168 + map->value_size, 169 + __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, 170 + map->numa_node); 171 171 if (!new) 172 172 return -ENOMEM; 173 173 ··· 287 287 { 288 288 int numa_node = bpf_map_attr_numa_node(attr); 289 289 struct bpf_cgroup_storage_map *map; 290 - struct bpf_map_memory mem; 291 - int ret; 292 290 293 291 if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) && 294 292 attr->key_size != sizeof(__u64)) ··· 306 308 /* max_entries is not used and enforced to be 0 */ 307 309 return ERR_PTR(-EINVAL); 308 310 309 - ret = bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map)); 310 - if (ret < 0) 311 - return ERR_PTR(ret); 312 - 313 311 map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), 314 - __GFP_ZERO | GFP_USER, numa_node); 315 - if (!map) { 316 - bpf_map_charge_finish(&mem); 312 + __GFP_ZERO | GFP_USER | __GFP_ACCOUNT, numa_node); 313 + if (!map) 317 314 return ERR_PTR(-ENOMEM); 318 - } 319 - 320 - bpf_map_charge_move(&map->map.memory, &mem); 321 315 322 316 /* copy mandatory map attributes */ 323 317 bpf_map_init_from_attr(&map->map, attr); ··· 486 496 struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, 487 497 enum bpf_cgroup_storage_type stype) 488 498 { 499 + const gfp_t gfp = __GFP_ZERO | GFP_USER; 489 500 struct bpf_cgroup_storage *storage; 490 501 struct bpf_map *map; 491 - gfp_t flags; 492 502 size_t size; 493 503 u32 pages; 494 504 ··· 498 508 499 509 size = bpf_cgroup_storage_calculate_size(map, &pages); 500 510 501 - if (bpf_map_charge_memlock(map, pages)) 502 - return ERR_PTR(-EPERM); 503 - 504 - storage = kmalloc_node(sizeof(struct bpf_cgroup_storage), 505 - __GFP_ZERO | GFP_USER, map->numa_node); 511 + storage = bpf_map_kmalloc_node(map, 
sizeof(struct bpf_cgroup_storage), 512 + gfp, map->numa_node); 506 513 if (!storage) 507 514 goto enomem; 508 515 509 - flags = __GFP_ZERO | GFP_USER; 510 - 511 516 if (stype == BPF_CGROUP_STORAGE_SHARED) { 512 - storage->buf = kmalloc_node(size, flags, map->numa_node); 517 + storage->buf = bpf_map_kmalloc_node(map, size, gfp, 518 + map->numa_node); 513 519 if (!storage->buf) 514 520 goto enomem; 515 521 check_and_init_map_lock(map, storage->buf->data); 516 522 } else { 517 - storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags); 523 + storage->percpu_buf = bpf_map_alloc_percpu(map, size, 8, gfp); 518 524 if (!storage->percpu_buf) 519 525 goto enomem; 520 526 } ··· 520 534 return storage; 521 535 522 536 enomem: 523 - bpf_map_uncharge_memlock(map, pages); 524 537 kfree(storage); 525 538 return ERR_PTR(-ENOMEM); 526 539 } ··· 546 561 { 547 562 enum bpf_cgroup_storage_type stype; 548 563 struct bpf_map *map; 549 - u32 pages; 550 564 551 565 if (!storage) 552 566 return; 553 567 554 568 map = &storage->map->map; 555 - 556 - bpf_cgroup_storage_calculate_size(map, &pages); 557 - bpf_map_uncharge_memlock(map, pages); 558 - 559 569 stype = cgroup_storage_type(map); 560 570 if (stype == BPF_CGROUP_STORAGE_SHARED) 561 571 call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu);

+3 -16

kernel/bpf/lpm_trie.c

··· 282 282 if (value) 283 283 size += trie->map.value_size; 284 284 285 - node = kmalloc_node(size, GFP_ATOMIC | __GFP_NOWARN, 286 - trie->map.numa_node); 285 + node = bpf_map_kmalloc_node(&trie->map, size, GFP_ATOMIC | __GFP_NOWARN, 286 + trie->map.numa_node); 287 287 if (!node) 288 288 return NULL; 289 289 ··· 540 540 static struct bpf_map *trie_alloc(union bpf_attr *attr) 541 541 { 542 542 struct lpm_trie *trie; 543 - u64 cost = sizeof(*trie), cost_per_node; 544 - int ret; 545 543 546 544 if (!bpf_capable()) 547 545 return ERR_PTR(-EPERM); ··· 555 557 attr->value_size > LPM_VAL_SIZE_MAX) 556 558 return ERR_PTR(-EINVAL); 557 559 558 - trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN); 560 + trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); 559 561 if (!trie) 560 562 return ERR_PTR(-ENOMEM); 561 563 ··· 565 567 offsetof(struct bpf_lpm_trie_key, data); 566 568 trie->max_prefixlen = trie->data_size * 8; 567 569 568 - cost_per_node = sizeof(struct lpm_trie_node) + 569 - attr->value_size + trie->data_size; 570 - cost += (u64) attr->max_entries * cost_per_node; 571 - 572 - ret = bpf_map_charge_init(&trie->map.memory, cost); 573 - if (ret) 574 - goto out_err; 575 - 576 570 spin_lock_init(&trie->lock); 577 571 578 572 return &trie->map; 579 - out_err: 580 - kfree(trie); 581 - return ERR_PTR(ret); 582 573 } 583 574 584 575 static void trie_free(struct bpf_map *map)

+4 -12

kernel/bpf/queue_stack_maps.c

··· 66 66 67 67 static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) 68 68 { 69 - int ret, numa_node = bpf_map_attr_numa_node(attr); 70 - struct bpf_map_memory mem = {0}; 69 + int numa_node = bpf_map_attr_numa_node(attr); 71 70 struct bpf_queue_stack *qs; 72 - u64 size, queue_size, cost; 71 + u64 size, queue_size; 73 72 74 73 size = (u64) attr->max_entries + 1; 75 - cost = queue_size = sizeof(*qs) + size * attr->value_size; 76 - 77 - ret = bpf_map_charge_init(&mem, cost); 78 - if (ret < 0) 79 - return ERR_PTR(ret); 74 + queue_size = sizeof(*qs) + size * attr->value_size; 80 75 81 76 qs = bpf_map_area_alloc(queue_size, numa_node); 82 - if (!qs) { 83 - bpf_map_charge_finish(&mem); 77 + if (!qs) 84 78 return ERR_PTR(-ENOMEM); 85 - } 86 79 87 80 memset(qs, 0, sizeof(*qs)); 88 81 89 82 bpf_map_init_from_attr(&qs->map, attr); 90 83 91 - bpf_map_charge_move(&qs->map.memory, &mem); 92 84 qs->size = size; 93 85 94 86 raw_spin_lock_init(&qs->lock);

+2 -10

kernel/bpf/reuseport_array.c

··· 150 150 151 151 static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) 152 152 { 153 - int err, numa_node = bpf_map_attr_numa_node(attr); 153 + int numa_node = bpf_map_attr_numa_node(attr); 154 154 struct reuseport_array *array; 155 - struct bpf_map_memory mem; 156 155 u64 array_size; 157 156 158 157 if (!bpf_capable()) ··· 160 161 array_size = sizeof(*array); 161 162 array_size += (u64)attr->max_entries * sizeof(struct sock *); 162 163 163 - err = bpf_map_charge_init(&mem, array_size); 164 - if (err) 165 - return ERR_PTR(err); 166 - 167 164 /* allocate all map elements and zero-initialize them */ 168 165 array = bpf_map_area_alloc(array_size, numa_node); 169 - if (!array) { 170 - bpf_map_charge_finish(&mem); 166 + if (!array) 171 167 return ERR_PTR(-ENOMEM); 172 - } 173 168 174 169 /* copy mandatory map attributes */ 175 170 bpf_map_init_from_attr(&array->map, attr); 176 - bpf_map_charge_move(&array->map.memory, &mem); 177 171 178 172 return &array->map; 179 173 }

+8 -27

kernel/bpf/ringbuf.c

··· 48 48 49 49 struct bpf_ringbuf_map { 50 50 struct bpf_map map; 51 - struct bpf_map_memory memory; 52 51 struct bpf_ringbuf *rb; 53 52 }; 54 53 ··· 59 60 60 61 static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) 61 62 { 62 - const gfp_t flags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | 63 - __GFP_ZERO; 63 + const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL | 64 + __GFP_NOWARN | __GFP_ZERO; 64 65 int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES; 65 66 int nr_data_pages = data_sz >> PAGE_SHIFT; 66 67 int nr_pages = nr_meta_pages + nr_data_pages; ··· 87 88 * user-space implementations significantly. 88 89 */ 89 90 array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages); 90 - if (array_size > PAGE_SIZE) 91 - pages = vmalloc_node(array_size, numa_node); 92 - else 93 - pages = kmalloc_node(array_size, flags, numa_node); 91 + pages = bpf_map_area_alloc(array_size, numa_node); 94 92 if (!pages) 95 93 return NULL; 96 94 ··· 130 134 131 135 rb = bpf_ringbuf_area_alloc(data_sz, numa_node); 132 136 if (!rb) 133 - return ERR_PTR(-ENOMEM); 137 + return NULL; 134 138 135 139 spin_lock_init(&rb->spinlock); 136 140 init_waitqueue_head(&rb->waitq); ··· 146 150 static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) 147 151 { 148 152 struct bpf_ringbuf_map *rb_map; 149 - u64 cost; 150 - int err; 151 153 152 154 if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK) 153 155 return ERR_PTR(-EINVAL); ··· 161 167 return ERR_PTR(-E2BIG); 162 168 #endif 163 169 164 - rb_map = kzalloc(sizeof(*rb_map), GFP_USER); 170 + rb_map = kzalloc(sizeof(*rb_map), GFP_USER | __GFP_ACCOUNT); 165 171 if (!rb_map) 166 172 return ERR_PTR(-ENOMEM); 167 173 168 174 bpf_map_init_from_attr(&rb_map->map, attr); 169 175 170 - cost = sizeof(struct bpf_ringbuf_map) + 171 - sizeof(struct bpf_ringbuf) + 172 - attr->max_entries; 173 - err = bpf_map_charge_init(&rb_map->map.memory, cost); 174 - if (err) 175 - goto err_free_map; 176 - 177 176 rb_map->rb 
= bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); 178 - if (IS_ERR(rb_map->rb)) { 179 - err = PTR_ERR(rb_map->rb); 180 - goto err_uncharge; 177 + if (!rb_map->rb) { 178 + kfree(rb_map); 179 + return ERR_PTR(-ENOMEM); 181 180 } 182 181 183 182 return &rb_map->map; 184 - 185 - err_uncharge: 186 - bpf_map_charge_finish(&rb_map->map.memory); 187 - err_free_map: 188 - kfree(rb_map); 189 - return ERR_PTR(err); 190 183 } 191 184 192 185 static void bpf_ringbuf_free(struct bpf_ringbuf *rb)

+3 -13

kernel/bpf/stackmap.c

··· 90 90 { 91 91 u32 value_size = attr->value_size; 92 92 struct bpf_stack_map *smap; 93 - struct bpf_map_memory mem; 94 93 u64 cost, n_buckets; 95 94 int err; 96 95 ··· 118 119 119 120 cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); 120 121 cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); 121 - err = bpf_map_charge_init(&mem, cost); 122 - if (err) 123 - return ERR_PTR(err); 124 - 125 122 smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); 126 - if (!smap) { 127 - bpf_map_charge_finish(&mem); 123 + if (!smap) 128 124 return ERR_PTR(-ENOMEM); 129 - } 130 125 131 126 bpf_map_init_from_attr(&smap->map, attr); 132 127 smap->map.value_size = value_size; ··· 128 135 129 136 err = get_callchain_buffers(sysctl_perf_event_max_stack); 130 137 if (err) 131 - goto free_charge; 138 + goto free_smap; 132 139 133 140 err = prealloc_elems_and_freelist(smap); 134 141 if (err) 135 142 goto put_buffers; 136 143 137 - bpf_map_charge_move(&smap->map.memory, &mem); 138 - 139 144 return &smap->map; 140 145 141 146 put_buffers: 142 147 put_callchain_buffers(); 143 - free_charge: 144 - bpf_map_charge_finish(&mem); 148 + free_smap: 145 149 bpf_map_area_free(smap); 146 150 return ERR_PTR(err); 147 151 }

+152 -160

kernel/bpf/syscall.c

··· 31 31 #include <linux/poll.h> 32 32 #include <linux/bpf-netns.h> 33 33 #include <linux/rcupdate_trace.h> 34 + #include <linux/memcontrol.h> 34 35 35 36 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ 36 37 (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ ··· 128 127 return map; 129 128 } 130 129 131 - static u32 bpf_map_value_size(struct bpf_map *map) 130 + static u32 bpf_map_value_size(const struct bpf_map *map) 132 131 { 133 132 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 134 133 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || ··· 268 267 return err; 269 268 } 270 269 270 + /* Please do not use this function outside of the map creation path 271 + * (e.g. in the map update path) without taking care of setting the active 272 + * memory cgroup (see bpf_map_kmalloc_node() for an example). 273 + */ 271 274 static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) 272 275 { 273 276 /* We really just want to fail instead of triggering OOM killer ··· 284 279 * __GFP_RETRY_MAYFAIL to avoid such situations. 
285 280 */ 286 281 287 - const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO; 282 + const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_ACCOUNT; 288 283 unsigned int flags = 0; 289 284 unsigned long align = 1; 290 285 void *area; ··· 346 341 map->numa_node = bpf_map_attr_numa_node(attr); 347 342 } 348 343 349 - static int bpf_charge_memlock(struct user_struct *user, u32 pages) 350 - { 351 - unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 352 - 353 - if (atomic_long_add_return(pages, &user->locked_vm) > memlock_limit) { 354 - atomic_long_sub(pages, &user->locked_vm); 355 - return -EPERM; 356 - } 357 - return 0; 358 - } 359 - 360 - static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) 361 - { 362 - if (user) 363 - atomic_long_sub(pages, &user->locked_vm); 364 - } 365 - 366 - int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size) 367 - { 368 - u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT; 369 - struct user_struct *user; 370 - int ret; 371 - 372 - if (size >= U32_MAX - PAGE_SIZE) 373 - return -E2BIG; 374 - 375 - user = get_current_user(); 376 - ret = bpf_charge_memlock(user, pages); 377 - if (ret) { 378 - free_uid(user); 379 - return ret; 380 - } 381 - 382 - mem->pages = pages; 383 - mem->user = user; 384 - 385 - return 0; 386 - } 387 - 388 - void bpf_map_charge_finish(struct bpf_map_memory *mem) 389 - { 390 - bpf_uncharge_memlock(mem->user, mem->pages); 391 - free_uid(mem->user); 392 - } 393 - 394 - void bpf_map_charge_move(struct bpf_map_memory *dst, 395 - struct bpf_map_memory *src) 396 - { 397 - *dst = *src; 398 - 399 - /* Make sure src will not be used for the redundant uncharging. 
*/ 400 - memset(src, 0, sizeof(struct bpf_map_memory)); 401 - } 402 - 403 - int bpf_map_charge_memlock(struct bpf_map *map, u32 pages) 404 - { 405 - int ret; 406 - 407 - ret = bpf_charge_memlock(map->memory.user, pages); 408 - if (ret) 409 - return ret; 410 - map->memory.pages += pages; 411 - return ret; 412 - } 413 - 414 - void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages) 415 - { 416 - bpf_uncharge_memlock(map->memory.user, pages); 417 - map->memory.pages -= pages; 418 - } 419 - 420 344 static int bpf_map_alloc_id(struct bpf_map *map) 421 345 { 422 346 int id; ··· 390 456 __release(&map_idr_lock); 391 457 } 392 458 459 + #ifdef CONFIG_MEMCG_KMEM 460 + static void bpf_map_save_memcg(struct bpf_map *map) 461 + { 462 + map->memcg = get_mem_cgroup_from_mm(current->mm); 463 + } 464 + 465 + static void bpf_map_release_memcg(struct bpf_map *map) 466 + { 467 + mem_cgroup_put(map->memcg); 468 + } 469 + 470 + void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, 471 + int node) 472 + { 473 + struct mem_cgroup *old_memcg; 474 + void *ptr; 475 + 476 + old_memcg = set_active_memcg(map->memcg); 477 + ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node); 478 + set_active_memcg(old_memcg); 479 + 480 + return ptr; 481 + } 482 + 483 + void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) 484 + { 485 + struct mem_cgroup *old_memcg; 486 + void *ptr; 487 + 488 + old_memcg = set_active_memcg(map->memcg); 489 + ptr = kzalloc(size, flags | __GFP_ACCOUNT); 490 + set_active_memcg(old_memcg); 491 + 492 + return ptr; 493 + } 494 + 495 + void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, 496 + size_t align, gfp_t flags) 497 + { 498 + struct mem_cgroup *old_memcg; 499 + void __percpu *ptr; 500 + 501 + old_memcg = set_active_memcg(map->memcg); 502 + ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT); 503 + set_active_memcg(old_memcg); 504 + 505 + return ptr; 506 + } 507 + 508 + #else 509 + static void 
bpf_map_save_memcg(struct bpf_map *map) 510 + { 511 + } 512 + 513 + static void bpf_map_release_memcg(struct bpf_map *map) 514 + { 515 + } 516 + #endif 517 + 393 518 /* called from workqueue */ 394 519 static void bpf_map_free_deferred(struct work_struct *work) 395 520 { 396 521 struct bpf_map *map = container_of(work, struct bpf_map, work); 397 - struct bpf_map_memory mem; 398 522 399 - bpf_map_charge_move(&mem, &map->memory); 400 523 security_bpf_map_free(map); 524 + bpf_map_release_memcg(map); 401 525 /* implementation dependent freeing */ 402 526 map->ops->map_free(map); 403 - bpf_map_charge_finish(&mem); 404 527 } 405 528 406 529 static void bpf_map_put_uref(struct bpf_map *map) ··· 518 527 } 519 528 520 529 #ifdef CONFIG_PROC_FS 530 + /* Provides an approximation of the map's memory footprint. 531 + * Used only to provide a backward compatibility and display 532 + * a reasonable "memlock" info. 533 + */ 534 + static unsigned long bpf_map_memory_footprint(const struct bpf_map *map) 535 + { 536 + unsigned long size; 537 + 538 + size = round_up(map->key_size + bpf_map_value_size(map), 8); 539 + 540 + return round_up(map->max_entries * size, PAGE_SIZE); 541 + } 542 + 521 543 static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) 522 544 { 523 545 const struct bpf_map *map = filp->private_data; ··· 549 545 "value_size:\t%u\n" 550 546 "max_entries:\t%u\n" 551 547 "map_flags:\t%#x\n" 552 - "memlock:\t%llu\n" 548 + "memlock:\t%lu\n" 553 549 "map_id:\t%u\n" 554 550 "frozen:\t%u\n", 555 551 map->map_type, ··· 557 553 map->value_size, 558 554 map->max_entries, 559 555 map->map_flags, 560 - map->memory.pages * 1ULL << PAGE_SHIFT, 556 + bpf_map_memory_footprint(map), 561 557 map->id, 562 558 READ_ONCE(map->frozen)); 563 559 if (type) { ··· 800 796 static int map_create(union bpf_attr *attr) 801 797 { 802 798 int numa_node = bpf_map_attr_numa_node(attr); 803 - struct bpf_map_memory mem; 804 799 struct bpf_map *map; 805 800 int f_flags; 806 801 int err; ··· 
878 875 if (err) 879 876 goto free_map_sec; 880 877 878 + bpf_map_save_memcg(map); 879 + 881 880 err = bpf_map_new_fd(map, f_flags); 882 881 if (err < 0) { 883 882 /* failed to allocate fd. ··· 898 893 security_bpf_map_free(map); 899 894 free_map: 900 895 btf_put(map->btf); 901 - bpf_map_charge_move(&mem, &map->memory); 902 896 map->ops->map_free(map); 903 - bpf_map_charge_finish(&mem); 904 897 return err; 905 898 } 906 899 ··· 1632 1629 audit_log_end(ab); 1633 1630 } 1634 1631 1635 - int __bpf_prog_charge(struct user_struct *user, u32 pages) 1636 - { 1637 - unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 1638 - unsigned long user_bufs; 1639 - 1640 - if (user) { 1641 - user_bufs = atomic_long_add_return(pages, &user->locked_vm); 1642 - if (user_bufs > memlock_limit) { 1643 - atomic_long_sub(pages, &user->locked_vm); 1644 - return -EPERM; 1645 - } 1646 - } 1647 - 1648 - return 0; 1649 - } 1650 - 1651 - void __bpf_prog_uncharge(struct user_struct *user, u32 pages) 1652 - { 1653 - if (user) 1654 - atomic_long_sub(pages, &user->locked_vm); 1655 - } 1656 - 1657 - static int bpf_prog_charge_memlock(struct bpf_prog *prog) 1658 - { 1659 - struct user_struct *user = get_current_user(); 1660 - int ret; 1661 - 1662 - ret = __bpf_prog_charge(user, prog->pages); 1663 - if (ret) { 1664 - free_uid(user); 1665 - return ret; 1666 - } 1667 - 1668 - prog->aux->user = user; 1669 - return 0; 1670 - } 1671 - 1672 - static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) 1673 - { 1674 - struct user_struct *user = prog->aux->user; 1675 - 1676 - __bpf_prog_uncharge(user, prog->pages); 1677 - free_uid(user); 1678 - } 1679 - 1680 1632 static int bpf_prog_alloc_id(struct bpf_prog *prog) 1681 1633 { 1682 1634 int id; ··· 1681 1723 1682 1724 kvfree(aux->func_info); 1683 1725 kfree(aux->func_info_aux); 1684 - bpf_prog_uncharge_memlock(aux->prog); 1726 + free_uid(aux->user); 1685 1727 security_bpf_prog_free(aux); 1686 1728 bpf_prog_free(aux->prog); 1687 1729 } ··· 1691 
1733 bpf_prog_kallsyms_del_all(prog); 1692 1734 btf_put(prog->aux->btf); 1693 1735 bpf_prog_free_linfo(prog); 1736 + if (prog->aux->attach_btf) 1737 + btf_put(prog->aux->attach_btf); 1694 1738 1695 1739 if (deferred) { 1696 1740 if (prog->aux->sleepable) ··· 1926 1966 static int 1927 1967 bpf_prog_load_check_attach(enum bpf_prog_type prog_type, 1928 1968 enum bpf_attach_type expected_attach_type, 1929 - u32 btf_id, u32 prog_fd) 1969 + struct btf *attach_btf, u32 btf_id, 1970 + struct bpf_prog *dst_prog) 1930 1971 { 1931 1972 if (btf_id) { 1932 1973 if (btf_id > BTF_MAX_TYPE) 1974 + return -EINVAL; 1975 + 1976 + if (!attach_btf && !dst_prog) 1933 1977 return -EINVAL; 1934 1978 1935 1979 switch (prog_type) { ··· 1947 1983 } 1948 1984 } 1949 1985 1950 - if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING && 1986 + if (attach_btf && (!btf_id || dst_prog)) 1987 + return -EINVAL; 1988 + 1989 + if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING && 1951 1990 prog_type != BPF_PROG_TYPE_EXT) 1952 1991 return -EINVAL; 1953 1992 ··· 2067 2100 static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) 2068 2101 { 2069 2102 enum bpf_prog_type type = attr->prog_type; 2070 - struct bpf_prog *prog; 2103 + struct bpf_prog *prog, *dst_prog = NULL; 2104 + struct btf *attach_btf = NULL; 2071 2105 int err; 2072 2106 char license[128]; 2073 2107 bool is_gpl; ··· 2110 2142 if (is_perfmon_prog_type(type) && !perfmon_capable()) 2111 2143 return -EPERM; 2112 2144 2145 + /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog 2146 + * or btf, we need to check which one it is 2147 + */ 2148 + if (attr->attach_prog_fd) { 2149 + dst_prog = bpf_prog_get(attr->attach_prog_fd); 2150 + if (IS_ERR(dst_prog)) { 2151 + dst_prog = NULL; 2152 + attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd); 2153 + if (IS_ERR(attach_btf)) 2154 + return -EINVAL; 2155 + if (!btf_is_kernel(attach_btf)) { 2156 + btf_put(attach_btf); 2157 + return -EINVAL; 2158 + } 2159 + } 2160 + } else if 
(attr->attach_btf_id) { 2161 + /* fall back to vmlinux BTF, if BTF type ID is specified */ 2162 + attach_btf = bpf_get_btf_vmlinux(); 2163 + if (IS_ERR(attach_btf)) 2164 + return PTR_ERR(attach_btf); 2165 + if (!attach_btf) 2166 + return -EINVAL; 2167 + btf_get(attach_btf); 2168 + } 2169 + 2113 2170 bpf_prog_load_fixup_attach_type(attr); 2114 2171 if (bpf_prog_load_check_attach(type, attr->expected_attach_type, 2115 - attr->attach_btf_id, 2116 - attr->attach_prog_fd)) 2172 + attach_btf, attr->attach_btf_id, 2173 + dst_prog)) { 2174 + if (dst_prog) 2175 + bpf_prog_put(dst_prog); 2176 + if (attach_btf) 2177 + btf_put(attach_btf); 2117 2178 return -EINVAL; 2179 + } 2118 2180 2119 2181 /* plain bpf_prog allocation */ 2120 2182 prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); 2121 - if (!prog) 2183 + if (!prog) { 2184 + if (dst_prog) 2185 + bpf_prog_put(dst_prog); 2186 + if (attach_btf) 2187 + btf_put(attach_btf); 2122 2188 return -ENOMEM; 2123 - 2124 - prog->expected_attach_type = attr->expected_attach_type; 2125 - prog->aux->attach_btf_id = attr->attach_btf_id; 2126 - if (attr->attach_prog_fd) { 2127 - struct bpf_prog *dst_prog; 2128 - 2129 - dst_prog = bpf_prog_get(attr->attach_prog_fd); 2130 - if (IS_ERR(dst_prog)) { 2131 - err = PTR_ERR(dst_prog); 2132 - goto free_prog_nouncharge; 2133 - } 2134 - prog->aux->dst_prog = dst_prog; 2135 2189 } 2136 2190 2191 + prog->expected_attach_type = attr->expected_attach_type; 2192 + prog->aux->attach_btf = attach_btf; 2193 + prog->aux->attach_btf_id = attr->attach_btf_id; 2194 + prog->aux->dst_prog = dst_prog; 2137 2195 prog->aux->offload_requested = !!attr->prog_ifindex; 2138 2196 prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; 2139 2197 2140 2198 err = security_bpf_prog_alloc(prog->aux); 2141 2199 if (err) 2142 - goto free_prog_nouncharge; 2200 + goto free_prog; 2143 2201 2144 - err = bpf_prog_charge_memlock(prog); 2145 - if (err) 2146 - goto free_prog_sec; 2147 - 2202 + prog->aux->user = 
get_current_user(); 2148 2203 prog->len = attr->insn_cnt; 2149 2204 2150 2205 err = -EFAULT; 2151 2206 if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns), 2152 2207 bpf_prog_insn_size(prog)) != 0) 2153 - goto free_prog; 2208 + goto free_prog_sec; 2154 2209 2155 2210 prog->orig_prog = NULL; 2156 2211 prog->jited = 0; ··· 2184 2193 if (bpf_prog_is_dev_bound(prog->aux)) { 2185 2194 err = bpf_prog_offload_init(prog, attr); 2186 2195 if (err) 2187 - goto free_prog; 2196 + goto free_prog_sec; 2188 2197 } 2189 2198 2190 2199 /* find program type: socket_filter vs tracing_filter */ 2191 2200 err = find_prog_type(type, prog); 2192 2201 if (err < 0) 2193 - goto free_prog; 2202 + goto free_prog_sec; 2194 2203 2195 2204 prog->aux->load_time = ktime_get_boottime_ns(); 2196 2205 err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, 2197 2206 sizeof(attr->prog_name)); 2198 2207 if (err < 0) 2199 - goto free_prog; 2208 + goto free_prog_sec; 2200 2209 2201 2210 /* run eBPF verifier */ 2202 2211 err = bpf_check(&prog, attr, uattr); ··· 2241 2250 */ 2242 2251 __bpf_prog_put_noref(prog, prog->aux->func_cnt); 2243 2252 return err; 2244 - free_prog: 2245 - bpf_prog_uncharge_memlock(prog); 2246 2253 free_prog_sec: 2254 + free_uid(prog->aux->user); 2247 2255 security_bpf_prog_free(prog->aux); 2248 - free_prog_nouncharge: 2256 + free_prog: 2257 + if (prog->aux->attach_btf) 2258 + btf_put(prog->aux->attach_btf); 2249 2259 bpf_prog_free(prog); 2250 2260 return err; 2251 2261 } ··· 2604 2612 goto out_put_prog; 2605 2613 } 2606 2614 2607 - key = bpf_trampoline_compute_key(tgt_prog, btf_id); 2615 + key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); 2608 2616 } 2609 2617 2610 2618 link = kzalloc(sizeof(*link), GFP_USER); ··· 3581 3589 } 3582 3590 3583 3591 if (prog->aux->btf) 3584 - info.btf_id = btf_id(prog->aux->btf); 3592 + info.btf_id = btf_obj_id(prog->aux->btf); 3585 3593 3586 3594 ulen = info.nr_func_info; 3587 3595 info.nr_func_info = prog->aux->func_info_cnt; ··· 
3684 3692 memcpy(info.name, map->name, sizeof(map->name)); 3685 3693 3686 3694 if (map->btf) { 3687 - info.btf_id = btf_id(map->btf); 3695 + info.btf_id = btf_obj_id(map->btf); 3688 3696 info.btf_key_type_id = map->btf_key_type_id; 3689 3697 info.btf_value_type_id = map->btf_value_type_id; 3690 3698 }

+17 -37

kernel/bpf/task_iter.c

··· 136 136 }; 137 137 138 138 static struct file * 139 - task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info, 140 - struct task_struct **task, struct files_struct **fstruct) 139 + task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) 141 140 { 142 141 struct pid_namespace *ns = info->common.ns; 143 142 u32 curr_tid = info->tid, max_fds; ··· 149 150 * Otherwise, it does not hold any reference. 150 151 */ 151 152 again: 152 - if (*task) { 153 - curr_task = *task; 154 - curr_files = *fstruct; 153 + if (info->task) { 154 + curr_task = info->task; 155 + curr_files = info->files; 155 156 curr_fd = info->fd; 156 157 } else { 157 158 curr_task = task_seq_get_next(ns, &curr_tid, true); 158 - if (!curr_task) 159 + if (!curr_task) { 160 + info->task = NULL; 161 + info->files = NULL; 159 162 return NULL; 163 + } 160 164 161 165 curr_files = get_files_struct(curr_task); 162 166 if (!curr_files) { ··· 169 167 goto again; 170 168 } 171 169 172 - /* set *fstruct, *task and info->tid */ 173 - *fstruct = curr_files; 174 - *task = curr_task; 170 + info->files = curr_files; 171 + info->task = curr_task; 175 172 if (curr_tid == info->tid) { 176 173 curr_fd = info->fd; 177 174 } else { ··· 200 199 rcu_read_unlock(); 201 200 put_files_struct(curr_files); 202 201 put_task_struct(curr_task); 203 - *task = NULL; 204 - *fstruct = NULL; 202 + info->task = NULL; 203 + info->files = NULL; 205 204 info->fd = 0; 206 205 curr_tid = ++(info->tid); 207 206 goto again; ··· 210 209 static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) 211 210 { 212 211 struct bpf_iter_seq_task_file_info *info = seq->private; 213 - struct files_struct *files = NULL; 214 - struct task_struct *task = NULL; 215 212 struct file *file; 216 213 217 - file = task_file_seq_get_next(info, &task, &files); 218 - if (!file) { 219 - info->files = NULL; 220 - info->task = NULL; 221 - return NULL; 222 - } 223 - 224 - if (*pos == 0) 214 + info->task = NULL; 215 + info->files = NULL; 216 + file = 
task_file_seq_get_next(info); 217 + if (file && *pos == 0) 225 218 ++*pos; 226 - info->task = task; 227 - info->files = files; 228 219 229 220 return file; 230 221 } ··· 224 231 static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos) 225 232 { 226 233 struct bpf_iter_seq_task_file_info *info = seq->private; 227 - struct files_struct *files = info->files; 228 - struct task_struct *task = info->task; 229 - struct file *file; 230 234 231 235 ++*pos; 232 236 ++info->fd; 233 237 fput((struct file *)v); 234 - file = task_file_seq_get_next(info, &task, &files); 235 - if (!file) { 236 - info->files = NULL; 237 - info->task = NULL; 238 - return NULL; 239 - } 240 - 241 - info->task = task; 242 - info->files = files; 243 - 244 - return file; 238 + return task_file_seq_get_next(info); 245 239 } 246 240 247 241 struct bpf_iter__task_file {

+141 -113

kernel/bpf/verifier.c

··· 238 238 u64 msize_max_value; 239 239 int ref_obj_id; 240 240 int func_id; 241 + struct btf *btf; 241 242 u32 btf_id; 243 + struct btf *ret_btf; 242 244 u32 ret_btf_id; 243 245 }; 244 246 ··· 558 556 return cur->frame[reg->frameno]; 559 557 } 560 558 561 - const char *kernel_type_name(u32 id) 559 + static const char *kernel_type_name(const struct btf* btf, u32 id) 562 560 { 563 - return btf_name_by_offset(btf_vmlinux, 564 - btf_type_by_id(btf_vmlinux, id)->name_off); 561 + return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); 565 562 } 566 563 567 564 static void print_verifier_state(struct bpf_verifier_env *env, ··· 590 589 if (t == PTR_TO_BTF_ID || 591 590 t == PTR_TO_BTF_ID_OR_NULL || 592 591 t == PTR_TO_PERCPU_BTF_ID) 593 - verbose(env, "%s", kernel_type_name(reg->btf_id)); 592 + verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id)); 594 593 verbose(env, "(id=%d", reg->id); 595 594 if (reg_type_may_be_refcounted_or_null(t)) 596 595 verbose(env, ",ref_obj_id=%d", reg->ref_obj_id); ··· 1384 1383 1385 1384 static void mark_btf_ld_reg(struct bpf_verifier_env *env, 1386 1385 struct bpf_reg_state *regs, u32 regno, 1387 - enum bpf_reg_type reg_type, u32 btf_id) 1386 + enum bpf_reg_type reg_type, 1387 + struct btf *btf, u32 btf_id) 1388 1388 { 1389 1389 if (reg_type == SCALAR_VALUE) { 1390 1390 mark_reg_unknown(env, regs, regno); ··· 1393 1391 } 1394 1392 mark_reg_known_zero(env, regs, regno); 1395 1393 regs[regno].type = PTR_TO_BTF_ID; 1394 + regs[regno].btf = btf; 1396 1395 regs[regno].btf_id = btf_id; 1397 1396 } 1398 1397 ··· 2767 2764 /* check access to 'struct bpf_context' fields. 
Supports fixed offsets only */ 2768 2765 static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, 2769 2766 enum bpf_access_type t, enum bpf_reg_type *reg_type, 2770 - u32 *btf_id) 2767 + struct btf **btf, u32 *btf_id) 2771 2768 { 2772 2769 struct bpf_insn_access_aux info = { 2773 2770 .reg_type = *reg_type, ··· 2785 2782 */ 2786 2783 *reg_type = info.reg_type; 2787 2784 2788 - if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) 2785 + if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) { 2786 + *btf = info.btf; 2789 2787 *btf_id = info.btf_id; 2790 - else 2788 + } else { 2791 2789 env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; 2790 + } 2792 2791 /* remember the offset of last byte accessed in ctx */ 2793 2792 if (env->prog->aux->max_ctx_offset < off + size) 2794 2793 env->prog->aux->max_ctx_offset = off + size; ··· 3302 3297 int value_regno) 3303 3298 { 3304 3299 struct bpf_reg_state *reg = regs + regno; 3305 - const struct btf_type *t = btf_type_by_id(btf_vmlinux, reg->btf_id); 3306 - const char *tname = btf_name_by_offset(btf_vmlinux, t->name_off); 3300 + const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id); 3301 + const char *tname = btf_name_by_offset(reg->btf, t->name_off); 3307 3302 u32 btf_id; 3308 3303 int ret; 3309 3304 ··· 3324 3319 } 3325 3320 3326 3321 if (env->ops->btf_struct_access) { 3327 - ret = env->ops->btf_struct_access(&env->log, t, off, size, 3328 - atype, &btf_id); 3322 + ret = env->ops->btf_struct_access(&env->log, reg->btf, t, 3323 + off, size, atype, &btf_id); 3329 3324 } else { 3330 3325 if (atype != BPF_READ) { 3331 3326 verbose(env, "only read is supported\n"); 3332 3327 return -EACCES; 3333 3328 } 3334 3329 3335 - ret = btf_struct_access(&env->log, t, off, size, atype, 3336 - &btf_id); 3330 + ret = btf_struct_access(&env->log, reg->btf, t, off, size, 3331 + atype, &btf_id); 3337 3332 } 3338 3333 3339 3334 if (ret < 0) 3340 3335 return 
ret; 3341 3336 3342 3337 if (atype == BPF_READ && value_regno >= 0) 3343 - mark_btf_ld_reg(env, regs, value_regno, ret, btf_id); 3338 + mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id); 3344 3339 3345 3340 return 0; 3346 3341 } ··· 3390 3385 return -EACCES; 3391 3386 } 3392 3387 3393 - ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id); 3388 + ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id); 3394 3389 if (ret < 0) 3395 3390 return ret; 3396 3391 3397 3392 if (value_regno >= 0) 3398 - mark_btf_ld_reg(env, regs, value_regno, ret, btf_id); 3393 + mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id); 3399 3394 3400 3395 return 0; 3401 3396 } ··· 3471 3466 mark_reg_unknown(env, regs, value_regno); 3472 3467 } else if (reg->type == PTR_TO_CTX) { 3473 3468 enum bpf_reg_type reg_type = SCALAR_VALUE; 3469 + struct btf *btf = NULL; 3474 3470 u32 btf_id = 0; 3475 3471 3476 3472 if (t == BPF_WRITE && value_regno >= 0 && ··· 3484 3478 if (err < 0) 3485 3479 return err; 3486 3480 3487 - err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf_id); 3481 + err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf, &btf_id); 3488 3482 if (err) 3489 3483 verbose_linfo(env, insn_idx, "; "); 3490 3484 if (!err && t == BPF_READ && value_regno >= 0) { ··· 3506 3500 */ 3507 3501 regs[value_regno].subreg_def = DEF_NOT_SUBREG; 3508 3502 if (reg_type == PTR_TO_BTF_ID || 3509 - reg_type == PTR_TO_BTF_ID_OR_NULL) 3503 + reg_type == PTR_TO_BTF_ID_OR_NULL) { 3504 + regs[value_regno].btf = btf; 3510 3505 regs[value_regno].btf_id = btf_id; 3506 + } 3511 3507 } 3512 3508 regs[value_regno].type = reg_type; 3513 3509 } ··· 4126 4118 arg_btf_id = compatible->btf_id; 4127 4119 } 4128 4120 4129 - if (!btf_struct_ids_match(&env->log, reg->off, reg->btf_id, 4130 - *arg_btf_id)) { 4121 + if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, 4122 + btf_vmlinux, *arg_btf_id)) { 4131 4123 verbose(env, 
"R%d is of type %s but %s is expected\n", 4132 - regno, kernel_type_name(reg->btf_id), 4133 - kernel_type_name(*arg_btf_id)); 4124 + regno, kernel_type_name(reg->btf, reg->btf_id), 4125 + kernel_type_name(btf_vmlinux, *arg_btf_id)); 4134 4126 return -EACCES; 4135 4127 } 4136 4128 ··· 4252 4244 verbose(env, "Helper has invalid btf_id in R%d\n", regno); 4253 4245 return -EACCES; 4254 4246 } 4247 + meta->ret_btf = reg->btf; 4255 4248 meta->ret_btf_id = reg->btf_id; 4256 4249 } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { 4257 4250 if (meta->func_id == BPF_FUNC_spin_lock) { ··· 5199 5190 const struct btf_type *t; 5200 5191 5201 5192 mark_reg_known_zero(env, regs, BPF_REG_0); 5202 - t = btf_type_skip_modifiers(btf_vmlinux, meta.ret_btf_id, NULL); 5193 + t = btf_type_skip_modifiers(meta.ret_btf, meta.ret_btf_id, NULL); 5203 5194 if (!btf_type_is_struct(t)) { 5204 5195 u32 tsize; 5205 5196 const struct btf_type *ret; 5206 5197 const char *tname; 5207 5198 5208 5199 /* resolve the type size of ksym. */ 5209 - ret = btf_resolve_size(btf_vmlinux, t, &tsize); 5200 + ret = btf_resolve_size(meta.ret_btf, t, &tsize); 5210 5201 if (IS_ERR(ret)) { 5211 - tname = btf_name_by_offset(btf_vmlinux, t->name_off); 5202 + tname = btf_name_by_offset(meta.ret_btf, t->name_off); 5212 5203 verbose(env, "unable to resolve the size of type '%s': %ld\n", 5213 5204 tname, PTR_ERR(ret)); 5214 5205 return -EINVAL; ··· 5221 5212 regs[BPF_REG_0].type = 5222 5213 fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? 
5223 5214 PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; 5215 + regs[BPF_REG_0].btf = meta.ret_btf; 5224 5216 regs[BPF_REG_0].btf_id = meta.ret_btf_id; 5225 5217 } 5226 5218 } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL || ··· 5238 5228 fn->ret_type, func_id_name(func_id), func_id); 5239 5229 return -EINVAL; 5240 5230 } 5231 + /* current BPF helper definitions are only coming from 5232 + * built-in code with type IDs from vmlinux BTF 5233 + */ 5234 + regs[BPF_REG_0].btf = btf_vmlinux; 5241 5235 regs[BPF_REG_0].btf_id = ret_btf_id; 5242 5236 } else { 5243 5237 verbose(env, "unknown return type %d of func %s#%d\n", ··· 5641 5627 if (reg_is_pkt_pointer(ptr_reg)) { 5642 5628 dst_reg->id = ++env->id_gen; 5643 5629 /* something was added to pkt_ptr, set range to zero */ 5644 - dst_reg->raw = 0; 5630 + memset(&dst_reg->raw, 0, sizeof(dst_reg->raw)); 5645 5631 } 5646 5632 break; 5647 5633 case BPF_SUB: ··· 5706 5692 dst_reg->id = ++env->id_gen; 5707 5693 /* something was added to pkt_ptr, set range to zero */ 5708 5694 if (smin_val < 0) 5709 - dst_reg->raw = 0; 5695 + memset(&dst_reg->raw, 0, sizeof(dst_reg->raw)); 5710 5696 } 5711 5697 break; 5712 5698 case BPF_AND: ··· 7758 7744 break; 7759 7745 case PTR_TO_BTF_ID: 7760 7746 case PTR_TO_PERCPU_BTF_ID: 7747 + dst_reg->btf = aux->btf_var.btf; 7761 7748 dst_reg->btf_id = aux->btf_var.btf_id; 7762 7749 break; 7763 7750 default: ··· 8073 8058 env->insn_aux_data[idx].prune_point = true; 8074 8059 } 8075 8060 8061 + enum { 8062 + DONE_EXPLORING = 0, 8063 + KEEP_EXPLORING = 1, 8064 + }; 8065 + 8076 8066 /* t, w, e - match pseudo-code above: 8077 8067 * t - index of current instruction 8078 8068 * w - next instruction ··· 8090 8070 int *insn_state = env->cfg.insn_state; 8091 8071 8092 8072 if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) 8093 - return 0; 8073 + return DONE_EXPLORING; 8094 8074 8095 8075 if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH)) 8096 - return 0; 8076 + return DONE_EXPLORING; 
8097 8077 8098 8078 if (w < 0 || w >= env->prog->len) { 8099 8079 verbose_linfo(env, t, "%d: ", t); ··· 8112 8092 if (env->cfg.cur_stack >= env->prog->len) 8113 8093 return -E2BIG; 8114 8094 insn_stack[env->cfg.cur_stack++] = w; 8115 - return 1; 8095 + return KEEP_EXPLORING; 8116 8096 } else if ((insn_state[w] & 0xF0) == DISCOVERED) { 8117 8097 if (loop_ok && env->bpf_capable) 8118 - return 0; 8098 + return DONE_EXPLORING; 8119 8099 verbose_linfo(env, t, "%d: ", t); 8120 8100 verbose_linfo(env, w, "%d: ", w); 8121 8101 verbose(env, "back-edge from insn %d to %d\n", t, w); ··· 8127 8107 verbose(env, "insn state internal bug\n"); 8128 8108 return -EFAULT; 8129 8109 } 8130 - return 0; 8110 + return DONE_EXPLORING; 8111 + } 8112 + 8113 + /* Visits the instruction at index t and returns one of the following: 8114 + * < 0 - an error occurred 8115 + * DONE_EXPLORING - the instruction was fully explored 8116 + * KEEP_EXPLORING - there is still work to be done before it is fully explored 8117 + */ 8118 + static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) 8119 + { 8120 + struct bpf_insn *insns = env->prog->insnsi; 8121 + int ret; 8122 + 8123 + /* All non-branch instructions have a single fall-through edge. 
*/ 8124 + if (BPF_CLASS(insns[t].code) != BPF_JMP && 8125 + BPF_CLASS(insns[t].code) != BPF_JMP32) 8126 + return push_insn(t, t + 1, FALLTHROUGH, env, false); 8127 + 8128 + switch (BPF_OP(insns[t].code)) { 8129 + case BPF_EXIT: 8130 + return DONE_EXPLORING; 8131 + 8132 + case BPF_CALL: 8133 + ret = push_insn(t, t + 1, FALLTHROUGH, env, false); 8134 + if (ret) 8135 + return ret; 8136 + 8137 + if (t + 1 < insn_cnt) 8138 + init_explored_state(env, t + 1); 8139 + if (insns[t].src_reg == BPF_PSEUDO_CALL) { 8140 + init_explored_state(env, t); 8141 + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, 8142 + env, false); 8143 + } 8144 + return ret; 8145 + 8146 + case BPF_JA: 8147 + if (BPF_SRC(insns[t].code) != BPF_K) 8148 + return -EINVAL; 8149 + 8150 + /* unconditional jump with single edge */ 8151 + ret = push_insn(t, t + insns[t].off + 1, FALLTHROUGH, env, 8152 + true); 8153 + if (ret) 8154 + return ret; 8155 + 8156 + /* unconditional jmp is not a good pruning point, 8157 + * but it's marked, since backtracking needs 8158 + * to record jmp history in is_state_visited(). 
8159 + */ 8160 + init_explored_state(env, t + insns[t].off + 1); 8161 + /* tell verifier to check for equivalent states 8162 + * after every call and jump 8163 + */ 8164 + if (t + 1 < insn_cnt) 8165 + init_explored_state(env, t + 1); 8166 + 8167 + return ret; 8168 + 8169 + default: 8170 + /* conditional jump with two edges */ 8171 + init_explored_state(env, t); 8172 + ret = push_insn(t, t + 1, FALLTHROUGH, env, true); 8173 + if (ret) 8174 + return ret; 8175 + 8176 + return push_insn(t, t + insns[t].off + 1, BRANCH, env, true); 8177 + } 8131 8178 } 8132 8179 8133 8180 /* non-recursive depth-first-search to detect loops in BPF program ··· 8202 8115 */ 8203 8116 static int check_cfg(struct bpf_verifier_env *env) 8204 8117 { 8205 - struct bpf_insn *insns = env->prog->insnsi; 8206 8118 int insn_cnt = env->prog->len; 8207 8119 int *insn_stack, *insn_state; 8208 8120 int ret = 0; 8209 - int i, t; 8121 + int i; 8210 8122 8211 8123 insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); 8212 8124 if (!insn_state) ··· 8221 8135 insn_stack[0] = 0; /* 0 is the first instruction */ 8222 8136 env->cfg.cur_stack = 1; 8223 8137 8224 - peek_stack: 8225 - if (env->cfg.cur_stack == 0) 8226 - goto check_state; 8227 - t = insn_stack[env->cfg.cur_stack - 1]; 8138 + while (env->cfg.cur_stack > 0) { 8139 + int t = insn_stack[env->cfg.cur_stack - 1]; 8228 8140 8229 - if (BPF_CLASS(insns[t].code) == BPF_JMP || 8230 - BPF_CLASS(insns[t].code) == BPF_JMP32) { 8231 - u8 opcode = BPF_OP(insns[t].code); 8232 - 8233 - if (opcode == BPF_EXIT) { 8234 - goto mark_explored; 8235 - } else if (opcode == BPF_CALL) { 8236 - ret = push_insn(t, t + 1, FALLTHROUGH, env, false); 8237 - if (ret == 1) 8238 - goto peek_stack; 8239 - else if (ret < 0) 8240 - goto err_free; 8241 - if (t + 1 < insn_cnt) 8242 - init_explored_state(env, t + 1); 8243 - if (insns[t].src_reg == BPF_PSEUDO_CALL) { 8244 - init_explored_state(env, t); 8245 - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, 8246 - 
env, false); 8247 - if (ret == 1) 8248 - goto peek_stack; 8249 - else if (ret < 0) 8250 - goto err_free; 8141 + ret = visit_insn(t, insn_cnt, env); 8142 + switch (ret) { 8143 + case DONE_EXPLORING: 8144 + insn_state[t] = EXPLORED; 8145 + env->cfg.cur_stack--; 8146 + break; 8147 + case KEEP_EXPLORING: 8148 + break; 8149 + default: 8150 + if (ret > 0) { 8151 + verbose(env, "visit_insn internal bug\n"); 8152 + ret = -EFAULT; 8251 8153 } 8252 - } else if (opcode == BPF_JA) { 8253 - if (BPF_SRC(insns[t].code) != BPF_K) { 8254 - ret = -EINVAL; 8255 - goto err_free; 8256 - } 8257 - /* unconditional jump with single edge */ 8258 - ret = push_insn(t, t + insns[t].off + 1, 8259 - FALLTHROUGH, env, true); 8260 - if (ret == 1) 8261 - goto peek_stack; 8262 - else if (ret < 0) 8263 - goto err_free; 8264 - /* unconditional jmp is not a good pruning point, 8265 - * but it's marked, since backtracking needs 8266 - * to record jmp history in is_state_visited(). 8267 - */ 8268 - init_explored_state(env, t + insns[t].off + 1); 8269 - /* tell verifier to check for equivalent states 8270 - * after every call and jump 8271 - */ 8272 - if (t + 1 < insn_cnt) 8273 - init_explored_state(env, t + 1); 8274 - } else { 8275 - /* conditional jump with two edges */ 8276 - init_explored_state(env, t); 8277 - ret = push_insn(t, t + 1, FALLTHROUGH, env, true); 8278 - if (ret == 1) 8279 - goto peek_stack; 8280 - else if (ret < 0) 8281 - goto err_free; 8282 - 8283 - ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true); 8284 - if (ret == 1) 8285 - goto peek_stack; 8286 - else if (ret < 0) 8287 - goto err_free; 8288 - } 8289 - } else { 8290 - /* all other non-branch instructions with single 8291 - * fall-through edge 8292 - */ 8293 - ret = push_insn(t, t + 1, FALLTHROUGH, env, false); 8294 - if (ret == 1) 8295 - goto peek_stack; 8296 - else if (ret < 0) 8297 8154 goto err_free; 8155 + } 8298 8156 } 8299 8157 8300 - mark_explored: 8301 - insn_state[t] = EXPLORED; 8302 - if (env->cfg.cur_stack-- <= 
0) { 8158 + if (env->cfg.cur_stack < 0) { 8303 8159 verbose(env, "pop stack internal bug\n"); 8304 8160 ret = -EFAULT; 8305 8161 goto err_free; 8306 8162 } 8307 - goto peek_stack; 8308 8163 8309 - check_state: 8310 8164 for (i = 0; i < insn_cnt; i++) { 8311 8165 if (insn_state[i] != EXPLORED) { 8312 8166 verbose(env, "unreachable insn %d\n", i); ··· 9766 9740 t = btf_type_skip_modifiers(btf_vmlinux, type, NULL); 9767 9741 if (percpu) { 9768 9742 aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; 9743 + aux->btf_var.btf = btf_vmlinux; 9769 9744 aux->btf_var.btf_id = type; 9770 9745 } else if (!btf_type_is_struct(t)) { 9771 9746 const struct btf_type *ret; ··· 9785 9758 aux->btf_var.mem_size = tsize; 9786 9759 } else { 9787 9760 aux->btf_var.reg_type = PTR_TO_BTF_ID; 9761 + aux->btf_var.btf = btf_vmlinux; 9788 9762 aux->btf_var.btf_id = type; 9789 9763 } 9790 9764 return 0; ··· 11638 11610 bpf_log(log, "Tracing programs must provide btf_id\n"); 11639 11611 return -EINVAL; 11640 11612 } 11641 - btf = tgt_prog ? tgt_prog->aux->btf : btf_vmlinux; 11613 + btf = tgt_prog ? tgt_prog->aux->btf : prog->aux->attach_btf; 11642 11614 if (!btf) { 11643 11615 bpf_log(log, 11644 11616 "FENTRY/FEXIT program can only be attached to another program annotated with BTF\n"); ··· 11914 11886 return ret; 11915 11887 } 11916 11888 11917 - key = bpf_trampoline_compute_key(tgt_prog, btf_id); 11889 + key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id); 11918 11890 tr = bpf_trampoline_get(key, &tgt_info); 11919 11891 if (!tr) 11920 11892 return -ENOMEM;

+4 -3

kernel/fork.c

··· 404 404 405 405 for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { 406 406 /* 407 - * If memcg_kmem_charge_page() fails, page->mem_cgroup 408 - * pointer is NULL, and memcg_kmem_uncharge_page() in 409 - * free_thread_stack() will ignore this page. 407 + * If memcg_kmem_charge_page() fails, page's 408 + * memory cgroup pointer is NULL, and 409 + * memcg_kmem_uncharge_page() in free_thread_stack() 410 + * will ignore this page. 410 411 */ 411 412 ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 412 413 0);

+4

kernel/module.c

··· 3709 3709 mod->init_layout.ro_size = 0; 3710 3710 mod->init_layout.ro_after_init_size = 0; 3711 3711 mod->init_layout.text_size = 0; 3712 + #ifdef CONFIG_DEBUG_INFO_BTF_MODULES 3713 + /* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */ 3714 + mod->btf_data = NULL; 3715 + #endif 3712 3716 /* 3713 3717 * We want to free module_init, but be aware that kallsyms may be 3714 3718 * walking this with preempt disabled. In all the failure paths, we

+7 -3

kernel/trace/bpf_trace.c

··· 1290 1290 return &bpf_ktime_get_ns_proto; 1291 1291 case BPF_FUNC_ktime_get_boot_ns: 1292 1292 return &bpf_ktime_get_boot_ns_proto; 1293 + case BPF_FUNC_ktime_get_coarse_ns: 1294 + return &bpf_ktime_get_coarse_ns_proto; 1293 1295 case BPF_FUNC_tail_call: 1294 1296 return &bpf_tail_call_proto; 1295 1297 case BPF_FUNC_get_current_pid_tgid: ··· 2070 2068 2071 2069 void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) 2072 2070 { 2073 - struct module *mod = __module_address((unsigned long)btp); 2071 + struct module *mod; 2074 2072 2075 - if (mod) 2076 - module_put(mod); 2073 + preempt_disable(); 2074 + mod = __module_address((unsigned long)btp); 2075 + module_put(mod); 2076 + preempt_enable(); 2077 2077 } 2078 2078 2079 2079 static __always_inline

+2 -2

mm/debug.c

··· 182 182 pr_warn("page dumped because: %s\n", reason); 183 183 184 184 #ifdef CONFIG_MEMCG 185 - if (!page_poisoned && page->mem_cgroup) 186 - pr_warn("page->mem_cgroup:%px\n", page->mem_cgroup); 185 + if (!page_poisoned && page->memcg_data) 186 + pr_warn("pages's memcg:%lx\n", page->memcg_data); 187 187 #endif 188 188 } 189 189

+2 -2

mm/huge_memory.c

··· 470 470 #ifdef CONFIG_MEMCG 471 471 static inline struct deferred_split *get_deferred_split_queue(struct page *page) 472 472 { 473 - struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; 473 + struct mem_cgroup *memcg = page_memcg(compound_head(page)); 474 474 struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); 475 475 476 476 if (memcg) ··· 2764 2764 { 2765 2765 struct deferred_split *ds_queue = get_deferred_split_queue(page); 2766 2766 #ifdef CONFIG_MEMCG 2767 - struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; 2767 + struct mem_cgroup *memcg = page_memcg(compound_head(page)); 2768 2768 #endif 2769 2769 unsigned long flags; 2770 2770

+62 -77

mm/memcontrol.c

··· 533 533 { 534 534 struct mem_cgroup *memcg; 535 535 536 - memcg = page->mem_cgroup; 536 + memcg = page_memcg(page); 537 537 538 538 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 539 539 memcg = root_mem_cgroup; ··· 560 560 unsigned long ino = 0; 561 561 562 562 rcu_read_lock(); 563 - memcg = page->mem_cgroup; 564 - 565 - /* 566 - * The lowest bit set means that memcg isn't a valid 567 - * memcg pointer, but a obj_cgroups pointer. 568 - * In this case the page is shared and doesn't belong 569 - * to any specific memory cgroup. 570 - */ 571 - if ((unsigned long) memcg & 0x1UL) 572 - memcg = NULL; 563 + memcg = page_memcg_check(page); 573 564 574 565 while (memcg && !(memcg->css.flags & CSS_ONLINE)) 575 566 memcg = parent_mem_cgroup(memcg); ··· 1046 1055 */ 1047 1056 struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) 1048 1057 { 1049 - struct mem_cgroup *memcg = page->mem_cgroup; 1058 + struct mem_cgroup *memcg = page_memcg(page); 1050 1059 1051 1060 if (mem_cgroup_disabled()) 1052 1061 return NULL; ··· 1345 1354 goto out; 1346 1355 } 1347 1356 1348 - memcg = page->mem_cgroup; 1357 + memcg = page_memcg(page); 1349 1358 /* 1350 1359 * Swapcache readahead pages are added to the LRU - and 1351 1360 * possibly migrated - before they are charged. 
··· 2105 2114 } 2106 2115 2107 2116 /** 2108 - * lock_page_memcg - lock a page->mem_cgroup binding 2117 + * lock_page_memcg - lock a page and memcg binding 2109 2118 * @page: the page 2110 2119 * 2111 2120 * This function protects unlocked LRU pages from being moved to ··· 2137 2146 if (mem_cgroup_disabled()) 2138 2147 return NULL; 2139 2148 again: 2140 - memcg = head->mem_cgroup; 2149 + memcg = page_memcg(head); 2141 2150 if (unlikely(!memcg)) 2142 2151 return NULL; 2143 2152 ··· 2145 2154 return memcg; 2146 2155 2147 2156 spin_lock_irqsave(&memcg->move_lock, flags); 2148 - if (memcg != head->mem_cgroup) { 2157 + if (memcg != page_memcg(head)) { 2149 2158 spin_unlock_irqrestore(&memcg->move_lock, flags); 2150 2159 goto again; 2151 2160 } ··· 2183 2192 } 2184 2193 2185 2194 /** 2186 - * unlock_page_memcg - unlock a page->mem_cgroup binding 2195 + * unlock_page_memcg - unlock a page and memcg binding 2187 2196 * @page: the page 2188 2197 */ 2189 2198 void unlock_page_memcg(struct page *page) 2190 2199 { 2191 2200 struct page *head = compound_head(page); 2192 2201 2193 - __unlock_page_memcg(head->mem_cgroup); 2202 + __unlock_page_memcg(page_memcg(head)); 2194 2203 } 2195 2204 EXPORT_SYMBOL(unlock_page_memcg); 2196 2205 ··· 2880 2889 2881 2890 static void commit_charge(struct page *page, struct mem_cgroup *memcg) 2882 2891 { 2883 - VM_BUG_ON_PAGE(page->mem_cgroup, page); 2892 + VM_BUG_ON_PAGE(page_memcg(page), page); 2884 2893 /* 2885 2894 * Any of the following ensures page->mem_cgroup stability: 2886 2895 * ··· 2889 2898 * - lock_page_memcg() 2890 2899 * - exclusive reference 2891 2900 */ 2892 - page->mem_cgroup = memcg; 2901 + page->memcg_data = (unsigned long)memcg; 2893 2902 } 2894 2903 2895 2904 #ifdef CONFIG_MEMCG_KMEM ··· 2904 2913 if (!vec) 2905 2914 return -ENOMEM; 2906 2915 2907 - if (cmpxchg(&page->obj_cgroups, NULL, 2908 - (struct obj_cgroup **) ((unsigned long)vec | 0x1UL))) 2916 + if (!set_page_objcgs(page, vec)) 2909 2917 kfree(vec); 2910 2918 else 
2911 2919 kmemleak_not_leak(vec); ··· 2914 2924 2915 2925 /* 2916 2926 * Returns a pointer to the memory cgroup to which the kernel object is charged. 2927 + * 2928 + * A passed kernel object can be a slab object or a generic kernel page, so 2929 + * different mechanisms for getting the memory cgroup pointer should be used. 2930 + * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller 2931 + * can not know for sure how the kernel object is implemented. 2932 + * mem_cgroup_from_obj() can be safely used in such cases. 2917 2933 * 2918 2934 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), 2919 2935 * cgroup_mutex, etc. ··· 2934 2938 page = virt_to_head_page(p); 2935 2939 2936 2940 /* 2937 - * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer 2938 - * or a pointer to obj_cgroup vector. In the latter case the lowest 2939 - * bit of the pointer is set. 2940 - * The page->mem_cgroup pointer can be asynchronously changed 2941 - * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed 2942 - * from a valid memcg pointer to objcg vector or back. 2943 - */ 2944 - if (!page->mem_cgroup) 2945 - return NULL; 2946 - 2947 - /* 2948 2941 * Slab objects are accounted individually, not per-page. 2949 2942 * Memcg membership data for each individual object is saved in 2950 2943 * the page->obj_cgroups. 
2951 2944 */ 2952 - if (page_has_obj_cgroups(page)) { 2945 + if (page_objcgs_check(page)) { 2953 2946 struct obj_cgroup *objcg; 2954 2947 unsigned int off; 2955 2948 2956 2949 off = obj_to_index(page->slab_cache, page, p); 2957 - objcg = page_obj_cgroups(page)[off]; 2950 + objcg = page_objcgs(page)[off]; 2958 2951 if (objcg) 2959 2952 return obj_cgroup_memcg(objcg); 2960 2953 2961 2954 return NULL; 2962 2955 } 2963 2956 2964 - /* All other pages use page->mem_cgroup */ 2965 - return page->mem_cgroup; 2957 + /* 2958 + * page_memcg_check() is used here, because page_has_obj_cgroups() 2959 + * check above could fail because the object cgroups vector wasn't set 2960 + * at that moment, but it can be set concurrently. 2961 + * page_memcg_check(page) will guarantee that a proper memory 2962 + * cgroup pointer or NULL will be returned. 2963 + */ 2964 + return page_memcg_check(page); 2966 2965 } 2967 2966 2968 2967 __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) ··· 3095 3104 if (memcg && !mem_cgroup_is_root(memcg)) { 3096 3105 ret = __memcg_kmem_charge(memcg, gfp, 1 << order); 3097 3106 if (!ret) { 3098 - page->mem_cgroup = memcg; 3099 - __SetPageKmemcg(page); 3107 + page->memcg_data = (unsigned long)memcg | 3108 + MEMCG_DATA_KMEM; 3100 3109 return 0; 3101 3110 } 3102 3111 css_put(&memcg->css); ··· 3111 3120 */ 3112 3121 void __memcg_kmem_uncharge_page(struct page *page, int order) 3113 3122 { 3114 - struct mem_cgroup *memcg = page->mem_cgroup; 3123 + struct mem_cgroup *memcg = page_memcg(page); 3115 3124 unsigned int nr_pages = 1 << order; 3116 3125 3117 3126 if (!memcg) ··· 3119 3128 3120 3129 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 3121 3130 __memcg_kmem_uncharge(memcg, nr_pages); 3122 - page->mem_cgroup = NULL; 3131 + page->memcg_data = 0; 3123 3132 css_put(&memcg->css); 3124 - 3125 - /* slab pages do not have PageKmemcg flag set */ 3126 - if (PageKmemcg(page)) 3127 - __ClearPageKmemcg(page); 3128 3133 } 3129 3134 3130 3135 static bool 
consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) ··· 3266 3279 */ 3267 3280 void mem_cgroup_split_huge_fixup(struct page *head) 3268 3281 { 3269 - struct mem_cgroup *memcg = head->mem_cgroup; 3282 + struct mem_cgroup *memcg = page_memcg(head); 3270 3283 int i; 3271 3284 3272 3285 if (mem_cgroup_disabled()) ··· 3274 3287 3275 3288 for (i = 1; i < HPAGE_PMD_NR; i++) { 3276 3289 css_get(&memcg->css); 3277 - head[i].mem_cgroup = memcg; 3290 + head[i].memcg_data = (unsigned long)memcg; 3278 3291 } 3279 3292 } 3280 3293 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ ··· 4656 4669 void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, 4657 4670 struct bdi_writeback *wb) 4658 4671 { 4659 - struct mem_cgroup *memcg = page->mem_cgroup; 4672 + struct mem_cgroup *memcg = page_memcg(page); 4660 4673 struct memcg_cgwb_frn *frn; 4661 4674 u64 now = get_jiffies_64(); 4662 4675 u64 oldest_at = now; ··· 5633 5646 5634 5647 /* 5635 5648 * Prevent mem_cgroup_migrate() from looking at 5636 - * page->mem_cgroup of its source page while we change it. 5649 + * page's memory cgroup of its source page while we change it. 5637 5650 */ 5638 5651 ret = -EBUSY; 5639 5652 if (!trylock_page(page)) 5640 5653 goto out; 5641 5654 5642 5655 ret = -EINVAL; 5643 - if (page->mem_cgroup != from) 5656 + if (page_memcg(page) != from) 5644 5657 goto out_unlock; 5645 5658 5646 5659 pgdat = page_pgdat(page); ··· 5695 5708 /* 5696 5709 * All state has been migrated, let's switch to the new memcg. 5697 5710 * 5698 - * It is safe to change page->mem_cgroup here because the page 5711 + * It is safe to change page's memcg here because the page 5699 5712 * is referenced, charged, isolated, and locked: we can't race 5700 5713 * with (un)charging, migration, LRU putback, or anything else 5701 - * that would rely on a stable page->mem_cgroup. 5714 + * that would rely on a stable page's memory cgroup. 
5702 5715 * 5703 5716 * Note that lock_page_memcg is a memcg lock, not a page lock, 5704 - * to save space. As soon as we switch page->mem_cgroup to a 5717 + * to save space. As soon as we switch page's memory cgroup to a 5705 5718 * new memcg that isn't locked, the above state can change 5706 5719 * concurrently again. Make sure we're truly done with it. 5707 5720 */ ··· 5710 5723 css_get(&to->css); 5711 5724 css_put(&from->css); 5712 5725 5713 - page->mem_cgroup = to; 5726 + page->memcg_data = (unsigned long)to; 5714 5727 5715 5728 __unlock_page_memcg(from); 5716 5729 ··· 5776 5789 * mem_cgroup_move_account() checks the page is valid or 5777 5790 * not under LRU exclusion. 5778 5791 */ 5779 - if (page->mem_cgroup == mc.from) { 5792 + if (page_memcg(page) == mc.from) { 5780 5793 ret = MC_TARGET_PAGE; 5781 5794 if (is_device_private_page(page)) 5782 5795 ret = MC_TARGET_DEVICE; ··· 5820 5833 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 5821 5834 if (!(mc.flags & MOVE_ANON)) 5822 5835 return ret; 5823 - if (page->mem_cgroup == mc.from) { 5836 + if (page_memcg(page) == mc.from) { 5824 5837 ret = MC_TARGET_PAGE; 5825 5838 if (target) { 5826 5839 get_page(page); ··· 6766 6779 /* 6767 6780 * Every swap fault against a single page tries to charge the 6768 6781 * page, bail as early as possible. shmem_unuse() encounters 6769 - * already charged pages, too. page->mem_cgroup is protected 6770 - * by the page lock, which serializes swap cache removal, which 6771 - * in turn serializes uncharging. 6782 + * already charged pages, too. page and memcg binding is 6783 + * protected by the page lock, which serializes swap cache 6784 + * removal, which in turn serializes uncharging. 
6772 6785 */ 6773 6786 VM_BUG_ON_PAGE(!PageLocked(page), page); 6774 - if (compound_head(page)->mem_cgroup) 6787 + if (page_memcg(compound_head(page))) 6775 6788 goto out; 6776 6789 6777 6790 id = lookup_swap_cgroup_id(ent); ··· 6855 6868 6856 6869 VM_BUG_ON_PAGE(PageLRU(page), page); 6857 6870 6858 - if (!page->mem_cgroup) 6871 + if (!page_memcg(page)) 6859 6872 return; 6860 6873 6861 6874 /* 6862 6875 * Nobody should be changing or seriously looking at 6863 - * page->mem_cgroup at this point, we have fully 6876 + * page_memcg(page) at this point, we have fully 6864 6877 * exclusive access to the page. 6865 6878 */ 6866 6879 6867 - if (ug->memcg != page->mem_cgroup) { 6880 + if (ug->memcg != page_memcg(page)) { 6868 6881 if (ug->memcg) { 6869 6882 uncharge_batch(ug); 6870 6883 uncharge_gather_clear(ug); 6871 6884 } 6872 - ug->memcg = page->mem_cgroup; 6885 + ug->memcg = page_memcg(page); 6873 6886 6874 6887 /* pairs with css_put in uncharge_batch */ 6875 6888 css_get(&ug->memcg->css); ··· 6878 6891 nr_pages = compound_nr(page); 6879 6892 ug->nr_pages += nr_pages; 6880 6893 6881 - if (!PageKmemcg(page)) { 6882 - ug->pgpgout++; 6883 - } else { 6894 + if (PageMemcgKmem(page)) 6884 6895 ug->nr_kmem += nr_pages; 6885 - __ClearPageKmemcg(page); 6886 - } 6896 + else 6897 + ug->pgpgout++; 6887 6898 6888 6899 ug->dummy_page = page; 6889 - page->mem_cgroup = NULL; 6900 + page->memcg_data = 0; 6890 6901 css_put(&ug->memcg->css); 6891 6902 } 6892 6903 ··· 6927 6942 return; 6928 6943 6929 6944 /* Don't touch page->lru of any random page, pre-check: */ 6930 - if (!page->mem_cgroup) 6945 + if (!page_memcg(page)) 6931 6946 return; 6932 6947 6933 6948 uncharge_gather_clear(&ug); ··· 6977 6992 return; 6978 6993 6979 6994 /* Page cache replacement: new page already charged? 
*/ 6980 - if (newpage->mem_cgroup) 6995 + if (page_memcg(newpage)) 6981 6996 return; 6982 6997 6983 6998 /* Swapcache readahead pages can get replaced before being charged */ 6984 - memcg = oldpage->mem_cgroup; 6999 + memcg = page_memcg(oldpage); 6985 7000 if (!memcg) 6986 7001 return; 6987 7002 ··· 7176 7191 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7177 7192 return; 7178 7193 7179 - memcg = page->mem_cgroup; 7194 + memcg = page_memcg(page); 7180 7195 7181 7196 /* Readahead page, never charged */ 7182 7197 if (!memcg) ··· 7197 7212 VM_BUG_ON_PAGE(oldid, page); 7198 7213 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); 7199 7214 7200 - page->mem_cgroup = NULL; 7215 + page->memcg_data = 0; 7201 7216 7202 7217 if (!mem_cgroup_is_root(memcg)) 7203 7218 page_counter_uncharge(&memcg->memory, nr_entries); ··· 7240 7255 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7241 7256 return 0; 7242 7257 7243 - memcg = page->mem_cgroup; 7258 + memcg = page_memcg(page); 7244 7259 7245 7260 /* Readahead page, never charged */ 7246 7261 if (!memcg) ··· 7321 7336 if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7322 7337 return false; 7323 7338 7324 - memcg = page->mem_cgroup; 7339 + memcg = page_memcg(page); 7325 7340 if (!memcg) 7326 7341 return false; 7327 7342

+4 -4

mm/page_alloc.c

··· 1092 1092 if (unlikely((unsigned long)page->mapping | 1093 1093 page_ref_count(page) | 1094 1094 #ifdef CONFIG_MEMCG 1095 - (unsigned long)page->mem_cgroup | 1095 + (unsigned long)page_memcg(page) | 1096 1096 #endif 1097 1097 (page->flags & check_flags))) 1098 1098 return false; ··· 1117 1117 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; 1118 1118 } 1119 1119 #ifdef CONFIG_MEMCG 1120 - if (unlikely(page->mem_cgroup)) 1120 + if (unlikely(page_memcg(page))) 1121 1121 bad_reason = "page still charged to cgroup"; 1122 1122 #endif 1123 1123 return bad_reason; ··· 1214 1214 * Do not let hwpoison pages hit pcplists/buddy 1215 1215 * Untie memcg state and reset page's owner 1216 1216 */ 1217 - if (memcg_kmem_enabled() && PageKmemcg(page)) 1217 + if (memcg_kmem_enabled() && PageMemcgKmem(page)) 1218 1218 __memcg_kmem_uncharge_page(page, order); 1219 1219 reset_page_owner(page, order); 1220 1220 return false; ··· 1244 1244 } 1245 1245 if (PageMappingFlags(page)) 1246 1246 page->mapping = NULL; 1247 - if (memcg_kmem_enabled() && PageKmemcg(page)) 1247 + if (memcg_kmem_enabled() && PageMemcgKmem(page)) 1248 1248 __memcg_kmem_uncharge_page(page, order); 1249 1249 if (check_free) 1250 1250 bad += check_free_page(page);

+4 -2

mm/page_io.c

··· 291 291 static void bio_associate_blkg_from_page(struct bio *bio, struct page *page) 292 292 { 293 293 struct cgroup_subsys_state *css; 294 + struct mem_cgroup *memcg; 294 295 295 - if (!page->mem_cgroup) 296 + memcg = page_memcg(page); 297 + if (!memcg) 296 298 return; 297 299 298 300 rcu_read_lock(); 299 - css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys); 301 + css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys); 300 302 bio_associate_blkg_from_css(bio, css); 301 303 rcu_read_unlock(); 302 304 }

+9 -29

mm/slab.h

··· 239 239 } 240 240 241 241 #ifdef CONFIG_MEMCG_KMEM 242 - static inline struct obj_cgroup **page_obj_cgroups(struct page *page) 243 - { 244 - /* 245 - * page->mem_cgroup and page->obj_cgroups are sharing the same 246 - * space. To distinguish between them in case we don't know for sure 247 - * that the page is a slab page (e.g. page_cgroup_ino()), let's 248 - * always set the lowest bit of obj_cgroups. 249 - */ 250 - return (struct obj_cgroup **) 251 - ((unsigned long)page->obj_cgroups & ~0x1UL); 252 - } 253 - 254 - static inline bool page_has_obj_cgroups(struct page *page) 255 - { 256 - return ((unsigned long)page->obj_cgroups & 0x1UL); 257 - } 258 - 259 242 int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, 260 243 gfp_t gfp); 261 244 262 245 static inline void memcg_free_page_obj_cgroups(struct page *page) 263 246 { 264 - kfree(page_obj_cgroups(page)); 265 - page->obj_cgroups = NULL; 247 + kfree(page_objcgs(page)); 248 + page->memcg_data = 0; 266 249 } 267 250 268 251 static inline size_t obj_full_size(struct kmem_cache *s) ··· 306 323 if (likely(p[i])) { 307 324 page = virt_to_head_page(p[i]); 308 325 309 - if (!page_has_obj_cgroups(page) && 326 + if (!page_objcgs(page) && 310 327 memcg_alloc_page_obj_cgroups(page, s, flags)) { 311 328 obj_cgroup_uncharge(objcg, obj_full_size(s)); 312 329 continue; ··· 314 331 315 332 off = obj_to_index(s, page, p[i]); 316 333 obj_cgroup_get(objcg); 317 - page_obj_cgroups(page)[off] = objcg; 334 + page_objcgs(page)[off] = objcg; 318 335 mod_objcg_state(objcg, page_pgdat(page), 319 336 cache_vmstat_idx(s), obj_full_size(s)); 320 337 } else { ··· 328 345 void **p, int objects) 329 346 { 330 347 struct kmem_cache *s; 348 + struct obj_cgroup **objcgs; 331 349 struct obj_cgroup *objcg; 332 350 struct page *page; 333 351 unsigned int off; ··· 342 358 continue; 343 359 344 360 page = virt_to_head_page(p[i]); 345 - if (!page_has_obj_cgroups(page)) 361 + objcgs = page_objcgs(page); 362 + if (!objcgs) 346 363 
continue; 347 364 348 365 if (!s_orig) ··· 352 367 s = s_orig; 353 368 354 369 off = obj_to_index(s, page, p[i]); 355 - objcg = page_obj_cgroups(page)[off]; 370 + objcg = objcgs[off]; 356 371 if (!objcg) 357 372 continue; 358 373 359 - page_obj_cgroups(page)[off] = NULL; 374 + objcgs[off] = NULL; 360 375 obj_cgroup_uncharge(objcg, obj_full_size(s)); 361 376 mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s), 362 377 -obj_full_size(s)); ··· 365 380 } 366 381 367 382 #else /* CONFIG_MEMCG_KMEM */ 368 - static inline bool page_has_obj_cgroups(struct page *page) 369 - { 370 - return false; 371 - } 372 - 373 383 static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) 374 384 { 375 385 return NULL;

+1 -1

mm/workingset.c

··· 257 257 struct lruvec *lruvec; 258 258 int memcgid; 259 259 260 - /* Page is fully exclusive and pins page->mem_cgroup */ 260 + /* Page is fully exclusive and pins page's memory cgroup pointer */ 261 261 VM_BUG_ON_PAGE(PageLRU(page), page); 262 262 VM_BUG_ON_PAGE(page_count(page), page); 263 263 VM_BUG_ON_PAGE(!PageLocked(page), page);

+2 -2

net/core/bpf_sk_storage.c

··· 415 415 BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, 416 416 void *, value, u64, flags) 417 417 { 418 - if (!in_serving_softirq() && !in_task()) 418 + if (in_irq() || in_nmi()) 419 419 return (unsigned long)NULL; 420 420 421 421 return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags); ··· 424 424 BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map, 425 425 struct sock *, sk) 426 426 { 427 - if (!in_serving_softirq() && !in_task()) 427 + if (in_irq() || in_nmi()) 428 428 return -EPERM; 429 429 430 430 return ____bpf_sk_storage_delete(map, sk);

+69 -22

net/core/dev.c

··· 6448 6448 6449 6449 WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); 6450 6450 6451 - new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED); 6451 + new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | 6452 + NAPIF_STATE_PREFER_BUSY_POLL); 6452 6453 6453 6454 /* If STATE_MISSED was set, leave STATE_SCHED set, 6454 6455 * because we will call napi->poll() one more time. ··· 6486 6485 6487 6486 #if defined(CONFIG_NET_RX_BUSY_POLL) 6488 6487 6489 - #define BUSY_POLL_BUDGET 8 6490 - 6491 - static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) 6488 + static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) 6492 6489 { 6490 + if (!skip_schedule) { 6491 + gro_normal_list(napi); 6492 + __napi_schedule(napi); 6493 + return; 6494 + } 6495 + 6496 + if (napi->gro_bitmask) { 6497 + /* flush too old packets 6498 + * If HZ < 1000, flush all packets. 6499 + */ 6500 + napi_gro_flush(napi, HZ >= 1000); 6501 + } 6502 + 6503 + gro_normal_list(napi); 6504 + clear_bit(NAPI_STATE_SCHED, &napi->state); 6505 + } 6506 + 6507 + static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll, 6508 + u16 budget) 6509 + { 6510 + bool skip_schedule = false; 6511 + unsigned long timeout; 6493 6512 int rc; 6494 6513 6495 6514 /* Busy polling means there is a high chance device driver hard irq ··· 6526 6505 6527 6506 local_bh_disable(); 6528 6507 6508 + if (prefer_busy_poll) { 6509 + napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); 6510 + timeout = READ_ONCE(napi->dev->gro_flush_timeout); 6511 + if (napi->defer_hard_irqs_count && timeout) { 6512 + hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); 6513 + skip_schedule = true; 6514 + } 6515 + } 6516 + 6529 6517 /* All we really want here is to re-enable device interrupts. 6530 6518 * Ideally, a new ndo_busy_poll_stop() could avoid another round. 
6531 6519 */ 6532 - rc = napi->poll(napi, BUSY_POLL_BUDGET); 6520 + rc = napi->poll(napi, budget); 6533 6521 /* We can't gro_normal_list() here, because napi->poll() might have 6534 6522 * rearmed the napi (napi_complete_done()) in which case it could 6535 6523 * already be running on another CPU. 6536 6524 */ 6537 - trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); 6525 + trace_napi_poll(napi, rc, budget); 6538 6526 netpoll_poll_unlock(have_poll_lock); 6539 - if (rc == BUSY_POLL_BUDGET) { 6540 - /* As the whole budget was spent, we still own the napi so can 6541 - * safely handle the rx_list. 6542 - */ 6543 - gro_normal_list(napi); 6544 - __napi_schedule(napi); 6545 - } 6527 + if (rc == budget) 6528 + __busy_poll_stop(napi, skip_schedule); 6546 6529 local_bh_enable(); 6547 6530 } 6548 6531 6549 6532 void napi_busy_loop(unsigned int napi_id, 6550 6533 bool (*loop_end)(void *, unsigned long), 6551 - void *loop_end_arg) 6534 + void *loop_end_arg, bool prefer_busy_poll, u16 budget) 6552 6535 { 6553 6536 unsigned long start_time = loop_end ? busy_loop_current_time() : 0; 6554 6537 int (*napi_poll)(struct napi_struct *napi, int budget); ··· 6580 6555 * we avoid dirtying napi->state as much as we can. 
6581 6556 */ 6582 6557 if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | 6583 - NAPIF_STATE_IN_BUSY_POLL)) 6558 + NAPIF_STATE_IN_BUSY_POLL)) { 6559 + if (prefer_busy_poll) 6560 + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); 6584 6561 goto count; 6562 + } 6585 6563 if (cmpxchg(&napi->state, val, 6586 6564 val | NAPIF_STATE_IN_BUSY_POLL | 6587 - NAPIF_STATE_SCHED) != val) 6565 + NAPIF_STATE_SCHED) != val) { 6566 + if (prefer_busy_poll) 6567 + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); 6588 6568 goto count; 6569 + } 6589 6570 have_poll_lock = netpoll_poll_lock(napi); 6590 6571 napi_poll = napi->poll; 6591 6572 } 6592 - work = napi_poll(napi, BUSY_POLL_BUDGET); 6593 - trace_napi_poll(napi, work, BUSY_POLL_BUDGET); 6573 + work = napi_poll(napi, budget); 6574 + trace_napi_poll(napi, work, budget); 6594 6575 gro_normal_list(napi); 6595 6576 count: 6596 6577 if (work > 0) ··· 6609 6578 6610 6579 if (unlikely(need_resched())) { 6611 6580 if (napi_poll) 6612 - busy_poll_stop(napi, have_poll_lock); 6581 + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); 6613 6582 preempt_enable(); 6614 6583 rcu_read_unlock(); 6615 6584 cond_resched(); ··· 6620 6589 cpu_relax(); 6621 6590 } 6622 6591 if (napi_poll) 6623 - busy_poll_stop(napi, have_poll_lock); 6592 + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); 6624 6593 preempt_enable(); 6625 6594 out: 6626 6595 rcu_read_unlock(); ··· 6671 6640 * NAPI_STATE_MISSED, since we do not react to a device IRQ. 
6672 6641 */ 6673 6642 if (!napi_disable_pending(napi) && 6674 - !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) 6643 + !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) { 6644 + clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); 6675 6645 __napi_schedule_irqoff(napi); 6646 + } 6676 6647 6677 6648 return HRTIMER_NORESTART; 6678 6649 } ··· 6732 6699 6733 6700 hrtimer_cancel(&n->timer); 6734 6701 6702 + clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); 6735 6703 clear_bit(NAPI_STATE_DISABLE, &n->state); 6736 6704 } 6737 6705 EXPORT_SYMBOL(napi_disable); ··· 6802 6768 */ 6803 6769 if (unlikely(napi_disable_pending(n))) { 6804 6770 napi_complete(n); 6771 + goto out_unlock; 6772 + } 6773 + 6774 + /* The NAPI context has more processing work, but busy-polling 6775 + * is preferred. Exit early. 6776 + */ 6777 + if (napi_prefer_busy_poll(n)) { 6778 + if (napi_complete_done(n, work)) { 6779 + /* If timeout is not set, we need to make sure 6780 + * that the NAPI is re-scheduled. 6781 + */ 6782 + napi_schedule(n); 6783 + } 6805 6784 goto out_unlock; 6806 6785 } 6807 6786 ··· 9800 9753 rx[i].dev = dev; 9801 9754 9802 9755 /* XDP RX-queue setup */ 9803 - err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i); 9756 + err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0); 9804 9757 if (err < 0) 9805 9758 goto err_rxq_info; 9806 9759 }

+7

net/core/filter.c

··· 4910 4910 tp->notsent_lowat = val; 4911 4911 sk->sk_write_space(sk); 4912 4912 break; 4913 + case TCP_WINDOW_CLAMP: 4914 + ret = tcp_set_window_clamp(sk, val); 4915 + break; 4913 4916 default: 4914 4917 ret = -EINVAL; 4915 4918 } ··· 6998 6995 return &bpf_sk_storage_delete_proto; 6999 6996 case BPF_FUNC_setsockopt: 7000 6997 switch (prog->expected_attach_type) { 6998 + case BPF_CGROUP_INET4_BIND: 6999 + case BPF_CGROUP_INET6_BIND: 7001 7000 case BPF_CGROUP_INET4_CONNECT: 7002 7001 case BPF_CGROUP_INET6_CONNECT: 7003 7002 return &bpf_sock_addr_setsockopt_proto; ··· 7008 7003 } 7009 7004 case BPF_FUNC_getsockopt: 7010 7005 switch (prog->expected_attach_type) { 7006 + case BPF_CGROUP_INET4_BIND: 7007 + case BPF_CGROUP_INET6_BIND: 7011 7008 case BPF_CGROUP_INET4_CONNECT: 7012 7009 case BPF_CGROUP_INET6_CONNECT: 7013 7010 return &bpf_sock_addr_getsockopt_proto;

+19

net/core/sock.c

··· 1159 1159 sk->sk_ll_usec = val; 1160 1160 } 1161 1161 break; 1162 + case SO_PREFER_BUSY_POLL: 1163 + if (valbool && !capable(CAP_NET_ADMIN)) 1164 + ret = -EPERM; 1165 + else 1166 + WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); 1167 + break; 1168 + case SO_BUSY_POLL_BUDGET: 1169 + if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) { 1170 + ret = -EPERM; 1171 + } else { 1172 + if (val < 0 || val > U16_MAX) 1173 + ret = -EINVAL; 1174 + else 1175 + WRITE_ONCE(sk->sk_busy_poll_budget, val); 1176 + } 1177 + break; 1162 1178 #endif 1163 1179 1164 1180 case SO_MAX_PACING_RATE: ··· 1538 1522 #ifdef CONFIG_NET_RX_BUSY_POLL 1539 1523 case SO_BUSY_POLL: 1540 1524 v.val = sk->sk_ll_usec; 1525 + break; 1526 + case SO_PREFER_BUSY_POLL: 1527 + v.val = READ_ONCE(sk->sk_prefer_busy_poll); 1541 1528 break; 1542 1529 #endif 1543 1530

+11 -31

net/core/sock_map.c

··· 27 27 static struct bpf_map *sock_map_alloc(union bpf_attr *attr) 28 28 { 29 29 struct bpf_stab *stab; 30 - u64 cost; 31 - int err; 32 30 33 31 if (!capable(CAP_NET_ADMIN)) 34 32 return ERR_PTR(-EPERM); ··· 37 39 attr->map_flags & ~SOCK_CREATE_FLAG_MASK) 38 40 return ERR_PTR(-EINVAL); 39 41 40 - stab = kzalloc(sizeof(*stab), GFP_USER); 42 + stab = kzalloc(sizeof(*stab), GFP_USER | __GFP_ACCOUNT); 41 43 if (!stab) 42 44 return ERR_PTR(-ENOMEM); 43 45 44 46 bpf_map_init_from_attr(&stab->map, attr); 45 47 raw_spin_lock_init(&stab->lock); 46 48 47 - /* Make sure page count doesn't overflow. */ 48 - cost = (u64) stab->map.max_entries * sizeof(struct sock *); 49 - err = bpf_map_charge_init(&stab->map.memory, cost); 50 - if (err) 51 - goto free_stab; 52 - 53 49 stab->sks = bpf_map_area_alloc(stab->map.max_entries * 54 50 sizeof(struct sock *), 55 51 stab->map.numa_node); 56 - if (stab->sks) 57 - return &stab->map; 58 - err = -ENOMEM; 59 - bpf_map_charge_finish(&stab->map.memory); 60 - free_stab: 61 - kfree(stab); 62 - return ERR_PTR(err); 52 + if (!stab->sks) { 53 + kfree(stab); 54 + return ERR_PTR(-ENOMEM); 55 + } 56 + 57 + return &stab->map; 63 58 } 64 59 65 60 int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) ··· 966 975 } 967 976 } 968 977 969 - new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, 970 - htab->map.numa_node); 978 + new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, 979 + GFP_ATOMIC | __GFP_NOWARN, 980 + htab->map.numa_node); 971 981 if (!new) { 972 982 atomic_dec(&htab->count); 973 983 return ERR_PTR(-ENOMEM); ··· 1095 1103 { 1096 1104 struct bpf_shtab *htab; 1097 1105 int i, err; 1098 - u64 cost; 1099 1106 1100 1107 if (!capable(CAP_NET_ADMIN)) 1101 1108 return ERR_PTR(-EPERM); ··· 1107 1116 if (attr->key_size > MAX_BPF_STACK) 1108 1117 return ERR_PTR(-E2BIG); 1109 1118 1110 - htab = kzalloc(sizeof(*htab), GFP_USER); 1119 + htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT); 1111 1120 if (!htab) 1112 
1121 return ERR_PTR(-ENOMEM); 1113 1122 ··· 1122 1131 goto free_htab; 1123 1132 } 1124 1133 1125 - cost = (u64) htab->buckets_num * sizeof(struct bpf_shtab_bucket) + 1126 - (u64) htab->elem_size * htab->map.max_entries; 1127 - if (cost >= U32_MAX - PAGE_SIZE) { 1128 - err = -EINVAL; 1129 - goto free_htab; 1130 - } 1131 - err = bpf_map_charge_init(&htab->map.memory, cost); 1132 - if (err) 1133 - goto free_htab; 1134 - 1135 1134 htab->buckets = bpf_map_area_alloc(htab->buckets_num * 1136 1135 sizeof(struct bpf_shtab_bucket), 1137 1136 htab->map.numa_node); 1138 1137 if (!htab->buckets) { 1139 - bpf_map_charge_finish(&htab->map.memory); 1140 1138 err = -ENOMEM; 1141 1139 goto free_htab; 1142 1140 }

+2 -1

net/core/xdp.c

··· 158 158 159 159 /* Returns 0 on success, negative on failure */ 160 160 int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, 161 - struct net_device *dev, u32 queue_index) 161 + struct net_device *dev, u32 queue_index, unsigned int napi_id) 162 162 { 163 163 if (xdp_rxq->reg_state == REG_STATE_UNUSED) { 164 164 WARN(1, "Driver promised not to register this"); ··· 179 179 xdp_rxq_info_init(xdp_rxq); 180 180 xdp_rxq->dev = dev; 181 181 xdp_rxq->queue_index = queue_index; 182 + xdp_rxq->napi_id = napi_id; 182 183 183 184 xdp_rxq->reg_state = REG_STATE_REGISTERED; 184 185 return 0;

+1 -1

net/ipv4/af_inet.c

··· 450 450 /* BPF prog is run before any checks are done so that if the prog 451 451 * changes context in a wrong way it will be caught. 452 452 */ 453 - err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr); 453 + err = BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr); 454 454 if (err) 455 455 return err; 456 456

+2 -1

net/ipv4/bpf_tcp_ca.c

··· 95 95 } 96 96 97 97 static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, 98 + const struct btf *btf, 98 99 const struct btf_type *t, int off, 99 100 int size, enum bpf_access_type atype, 100 101 u32 *next_btf_id) ··· 103 102 size_t end; 104 103 105 104 if (atype == BPF_READ) 106 - return btf_struct_access(log, t, off, size, atype, next_btf_id); 105 + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id); 107 106 108 107 if (t != tcp_sock_type) { 109 108 bpf_log(log, "only read is supported\n");

+16 -9

net/ipv4/tcp.c

··· 3042 3042 } 3043 3043 EXPORT_SYMBOL(tcp_sock_set_keepcnt); 3044 3044 3045 + int tcp_set_window_clamp(struct sock *sk, int val) 3046 + { 3047 + struct tcp_sock *tp = tcp_sk(sk); 3048 + 3049 + if (!val) { 3050 + if (sk->sk_state != TCP_CLOSE) 3051 + return -EINVAL; 3052 + tp->window_clamp = 0; 3053 + } else { 3054 + tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ? 3055 + SOCK_MIN_RCVBUF / 2 : val; 3056 + } 3057 + return 0; 3058 + } 3059 + 3045 3060 /* 3046 3061 * Socket option code for TCP. 3047 3062 */ ··· 3270 3255 break; 3271 3256 3272 3257 case TCP_WINDOW_CLAMP: 3273 - if (!val) { 3274 - if (sk->sk_state != TCP_CLOSE) { 3275 - err = -EINVAL; 3276 - break; 3277 - } 3278 - tp->window_clamp = 0; 3279 - } else 3280 - tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ? 3281 - SOCK_MIN_RCVBUF / 2 : val; 3258 + err = tcp_set_window_clamp(sk, val); 3282 3259 break; 3283 3260 3284 3261 case TCP_QUICKACK:

+1 -1

net/ipv6/af_inet6.c

··· 451 451 /* BPF prog is run before any checks are done so that if the prog 452 452 * changes context in a wrong way it will be caught. 453 453 */ 454 - err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr); 454 + err = BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr); 455 455 if (err) 456 456 return err; 457 457

+110 -4

net/xdp/xsk.c

··· 23 23 #include <linux/netdevice.h> 24 24 #include <linux/rculist.h> 25 25 #include <net/xdp_sock_drv.h> 26 + #include <net/busy_poll.h> 26 27 #include <net/xdp.h> 27 28 28 29 #include "xsk_queue.h" ··· 233 232 if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) 234 233 return -EINVAL; 235 234 235 + sk_mark_napi_id_once_xdp(&xs->sk, xdp); 236 236 len = xdp->data_end - xdp->data; 237 237 238 238 return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? ··· 333 331 return false; 334 332 } 335 333 EXPORT_SYMBOL(xsk_tx_peek_desc); 334 + 335 + static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, struct xdp_desc *descs, 336 + u32 max_entries) 337 + { 338 + u32 nb_pkts = 0; 339 + 340 + while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts])) 341 + nb_pkts++; 342 + 343 + xsk_tx_release(pool); 344 + return nb_pkts; 345 + } 346 + 347 + u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *descs, 348 + u32 max_entries) 349 + { 350 + struct xdp_sock *xs; 351 + u32 nb_pkts; 352 + 353 + rcu_read_lock(); 354 + if (!list_is_singular(&pool->xsk_tx_list)) { 355 + /* Fallback to the non-batched version */ 356 + rcu_read_unlock(); 357 + return xsk_tx_peek_release_fallback(pool, descs, max_entries); 358 + } 359 + 360 + xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list); 361 + if (!xs) { 362 + nb_pkts = 0; 363 + goto out; 364 + } 365 + 366 + nb_pkts = xskq_cons_peek_desc_batch(xs->tx, descs, pool, max_entries); 367 + if (!nb_pkts) { 368 + xs->tx->queue_empty_descs++; 369 + goto out; 370 + } 371 + 372 + /* This is the backpressure mechanism for the Tx path. Try to 373 + * reserve space in the completion queue for all packets, but 374 + * if there are fewer slots available, just process that many 375 + * packets. This avoids having to implement any buffering in 376 + * the Tx path. 
377 + */ 378 + nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, descs, nb_pkts); 379 + if (!nb_pkts) 380 + goto out; 381 + 382 + xskq_cons_release_n(xs->tx, nb_pkts); 383 + __xskq_cons_release(xs->tx); 384 + xs->sk.sk_write_space(&xs->sk); 385 + 386 + out: 387 + rcu_read_unlock(); 388 + return nb_pkts; 389 + } 390 + EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch); 336 391 337 392 static int xsk_wakeup(struct xdp_sock *xs, u8 flags) 338 393 { ··· 513 454 return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk); 514 455 } 515 456 457 + static bool xsk_no_wakeup(struct sock *sk) 458 + { 459 + #ifdef CONFIG_NET_RX_BUSY_POLL 460 + /* Prefer busy-polling, skip the wakeup. */ 461 + return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) && 462 + READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID; 463 + #else 464 + return false; 465 + #endif 466 + } 467 + 516 468 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) 517 469 { 518 470 bool need_wait = !(m->msg_flags & MSG_DONTWAIT); 519 471 struct sock *sk = sock->sk; 520 472 struct xdp_sock *xs = xdp_sk(sk); 473 + struct xsk_buff_pool *pool; 521 474 522 475 if (unlikely(!xsk_is_bound(xs))) 523 476 return -ENXIO; 524 477 if (unlikely(need_wait)) 525 478 return -EOPNOTSUPP; 526 479 527 - return __xsk_sendmsg(sk); 480 + if (sk_can_busy_loop(sk)) 481 + sk_busy_loop(sk, 1); /* only support non-blocking sockets */ 482 + 483 + if (xsk_no_wakeup(sk)) 484 + return 0; 485 + 486 + pool = xs->pool; 487 + if (pool->cached_need_wakeup & XDP_WAKEUP_TX) 488 + return __xsk_sendmsg(sk); 489 + return 0; 490 + } 491 + 492 + static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) 493 + { 494 + bool need_wait = !(flags & MSG_DONTWAIT); 495 + struct sock *sk = sock->sk; 496 + struct xdp_sock *xs = xdp_sk(sk); 497 + 498 + if (unlikely(!(xs->dev->flags & IFF_UP))) 499 + return -ENETDOWN; 500 + if (unlikely(!xs->rx)) 501 + return -ENOBUFS; 502 + if (unlikely(!xsk_is_bound(xs))) 503 + return 
-ENXIO; 504 + if (unlikely(need_wait)) 505 + return -EOPNOTSUPP; 506 + 507 + if (sk_can_busy_loop(sk)) 508 + sk_busy_loop(sk, 1); /* only support non-blocking sockets */ 509 + 510 + if (xsk_no_wakeup(sk)) 511 + return 0; 512 + 513 + if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc) 514 + return xsk_wakeup(xs, XDP_WAKEUP_RX); 515 + return 0; 528 516 } 529 517 530 518 static __poll_t xsk_poll(struct file *file, struct socket *sock, ··· 648 542 node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node, 649 543 node); 650 544 if (node) { 651 - WARN_ON(xsk_map_inc(node->map)); 545 + bpf_map_inc(&node->map->map); 652 546 map = node->map; 653 547 *map_entry = node->map_entry; 654 548 } ··· 678 572 679 573 while ((map = xsk_get_map_list_entry(xs, &map_entry))) { 680 574 xsk_map_try_sock_delete(map, xs, map_entry); 681 - xsk_map_put(map); 575 + bpf_map_put(&map->map); 682 576 } 683 577 } 684 578 ··· 1234 1128 .setsockopt = xsk_setsockopt, 1235 1129 .getsockopt = xsk_getsockopt, 1236 1130 .sendmsg = xsk_sendmsg, 1237 - .recvmsg = sock_no_recvmsg, 1131 + .recvmsg = xsk_recvmsg, 1238 1132 .mmap = xsk_mmap, 1239 1133 .sendpage = sock_no_sendpage, 1240 1134 };

-2

net/xdp/xsk.h

··· 41 41 42 42 void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, 43 43 struct xdp_sock **map_entry); 44 - int xsk_map_inc(struct xsk_map *map); 45 - void xsk_map_put(struct xsk_map *map); 46 44 void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id); 47 45 int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, 48 46 u16 queue_id);

+6 -7

net/xdp/xsk_buff_pool.c

··· 144 144 if (err) 145 145 return err; 146 146 147 - if (flags & XDP_USE_NEED_WAKEUP) { 147 + if (flags & XDP_USE_NEED_WAKEUP) 148 148 pool->uses_need_wakeup = true; 149 - /* Tx needs to be explicitly woken up the first time. 150 - * Also for supporting drivers that do not implement this 151 - * feature. They will always have to call sendto(). 152 - */ 153 - pool->cached_need_wakeup = XDP_WAKEUP_TX; 154 - } 149 + /* Tx needs to be explicitly woken up the first time. Also 150 + * for supporting drivers that do not implement this 151 + * feature. They will always have to call sendto() or poll(). 152 + */ 153 + pool->cached_need_wakeup = XDP_WAKEUP_TX; 155 154 156 155 dev_hold(netdev); 157 156

+79 -14

net/xdp/xsk_queue.h

··· 18 18 /* Hinder the adjacent cache prefetcher to prefetch the consumer 19 19 * pointer if the producer pointer is touched and vice versa. 20 20 */ 21 - u32 pad ____cacheline_aligned_in_smp; 21 + u32 pad1 ____cacheline_aligned_in_smp; 22 22 u32 consumer ____cacheline_aligned_in_smp; 23 + u32 pad2 ____cacheline_aligned_in_smp; 23 24 u32 flags; 25 + u32 pad3 ____cacheline_aligned_in_smp; 24 26 }; 25 27 26 28 /* Used for the RX and TX queues for packets */ ··· 199 197 return false; 200 198 } 201 199 200 + static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, 201 + struct xdp_desc *descs, 202 + struct xsk_buff_pool *pool, u32 max) 203 + { 204 + u32 cached_cons = q->cached_cons, nb_entries = 0; 205 + 206 + while (cached_cons != q->cached_prod && nb_entries < max) { 207 + struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; 208 + u32 idx = cached_cons & q->ring_mask; 209 + 210 + descs[nb_entries] = ring->desc[idx]; 211 + if (unlikely(!xskq_cons_is_valid_desc(q, &descs[nb_entries], pool))) { 212 + /* Skip the entry */ 213 + cached_cons++; 214 + continue; 215 + } 216 + 217 + nb_entries++; 218 + cached_cons++; 219 + } 220 + 221 + return nb_entries; 222 + } 223 + 202 224 /* Functions for consumers */ 203 225 204 226 static inline void __xskq_cons_release(struct xsk_queue *q) ··· 244 218 __xskq_cons_peek(q); 245 219 } 246 220 247 - static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) 221 + static inline u32 xskq_cons_nb_entries(struct xsk_queue *q, u32 max) 248 222 { 249 223 u32 entries = q->cached_prod - q->cached_cons; 250 224 251 - if (entries >= cnt) 252 - return true; 225 + if (entries >= max) 226 + return max; 253 227 254 228 __xskq_cons_peek(q); 255 229 entries = q->cached_prod - q->cached_cons; 256 230 257 - return entries >= cnt; 231 + return entries >= max ? max : entries; 232 + } 233 + 234 + static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) 235 + { 236 + return xskq_cons_nb_entries(q, cnt) >= cnt ? 
true : false; 258 237 } 259 238 260 239 static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) ··· 278 247 return xskq_cons_read_desc(q, desc, pool); 279 248 } 280 249 250 + static inline u32 xskq_cons_peek_desc_batch(struct xsk_queue *q, struct xdp_desc *descs, 251 + struct xsk_buff_pool *pool, u32 max) 252 + { 253 + u32 entries = xskq_cons_nb_entries(q, max); 254 + 255 + return xskq_cons_read_desc_batch(q, descs, pool, entries); 256 + } 257 + 258 + /* To improve performance in the xskq_cons_release functions, only update local state here. 259 + * Reflect this to global state when we get new entries from the ring in 260 + * xskq_cons_get_entries() and whenever Rx or Tx processing are completed in the NAPI loop. 261 + */ 281 262 static inline void xskq_cons_release(struct xsk_queue *q) 282 263 { 283 - /* To improve performance, only update local state here. 284 - * Reflect this to global state when we get new entries 285 - * from the ring in xskq_cons_get_entries() and whenever 286 - * Rx or Tx processing are completed in the NAPI loop. 287 - */ 288 264 q->cached_cons++; 265 + } 266 + 267 + static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt) 268 + { 269 + q->cached_cons += cnt; 289 270 } 290 271 291 272 static inline bool xskq_cons_is_full(struct xsk_queue *q) ··· 309 266 310 267 /* Functions for producers */ 311 268 312 - static inline bool xskq_prod_is_full(struct xsk_queue *q) 269 + static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max) 313 270 { 314 271 u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); 315 272 316 - if (free_entries) 317 - return false; 273 + if (free_entries >= max) 274 + return max; 318 275 319 276 /* Refresh the local tail pointer */ 320 277 q->cached_cons = READ_ONCE(q->ring->consumer); 321 278 free_entries = q->nentries - (q->cached_prod - q->cached_cons); 322 279 323 - return !free_entries; 280 + return free_entries >= max ? 
max : free_entries; 281 + } 282 + 283 + static inline bool xskq_prod_is_full(struct xsk_queue *q) 284 + { 285 + return xskq_prod_nb_free(q, 1) ? false : true; 324 286 } 325 287 326 288 static inline int xskq_prod_reserve(struct xsk_queue *q) ··· 348 300 /* A, matches D */ 349 301 ring->desc[q->cached_prod++ & q->ring_mask] = addr; 350 302 return 0; 303 + } 304 + 305 + static inline u32 xskq_prod_reserve_addr_batch(struct xsk_queue *q, struct xdp_desc *descs, 306 + u32 max) 307 + { 308 + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; 309 + u32 nb_entries, i, cached_prod; 310 + 311 + nb_entries = xskq_prod_nb_free(q, max); 312 + 313 + /* A, matches D */ 314 + cached_prod = q->cached_prod; 315 + for (i = 0; i < nb_entries; i++) 316 + ring->desc[cached_prod++ & q->ring_mask] = descs[i].addr; 317 + q->cached_prod = cached_prod; 318 + 319 + return nb_entries; 351 320 } 352 321 353 322 static inline int xskq_prod_reserve_desc(struct xsk_queue *q,

+6 -29

net/xdp/xskmap.c

··· 11 11 12 12 #include "xsk.h" 13 13 14 - int xsk_map_inc(struct xsk_map *map) 15 - { 16 - bpf_map_inc(&map->map); 17 - return 0; 18 - } 19 - 20 - void xsk_map_put(struct xsk_map *map) 21 - { 22 - bpf_map_put(&map->map); 23 - } 24 - 25 14 static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, 26 15 struct xdp_sock **map_entry) 27 16 { 28 17 struct xsk_map_node *node; 29 - int err; 30 18 31 - node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); 19 + node = bpf_map_kzalloc(&map->map, sizeof(*node), 20 + GFP_ATOMIC | __GFP_NOWARN); 32 21 if (!node) 33 22 return ERR_PTR(-ENOMEM); 34 23 35 - err = xsk_map_inc(map); 36 - if (err) { 37 - kfree(node); 38 - return ERR_PTR(err); 39 - } 24 + bpf_map_inc(&map->map); 40 25 41 26 node->map = map; 42 27 node->map_entry = map_entry; ··· 30 45 31 46 static void xsk_map_node_free(struct xsk_map_node *node) 32 47 { 33 - xsk_map_put(node->map); 48 + bpf_map_put(&node->map->map); 34 49 kfree(node); 35 50 } 36 51 ··· 58 73 59 74 static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) 60 75 { 61 - struct bpf_map_memory mem; 62 - int err, numa_node; 63 76 struct xsk_map *m; 77 + int numa_node; 64 78 u64 size; 65 79 66 80 if (!capable(CAP_NET_ADMIN)) ··· 73 89 numa_node = bpf_map_attr_numa_node(attr); 74 90 size = struct_size(m, xsk_map, attr->max_entries); 75 91 76 - err = bpf_map_charge_init(&mem, size); 77 - if (err < 0) 78 - return ERR_PTR(err); 79 - 80 92 m = bpf_map_area_alloc(size, numa_node); 81 - if (!m) { 82 - bpf_map_charge_finish(&mem); 93 + if (!m) 83 94 return ERR_PTR(-ENOMEM); 84 - } 85 95 86 96 bpf_map_init_from_attr(&m->map, attr); 87 - bpf_map_charge_move(&m->map.memory, &mem); 88 97 spin_lock_init(&m->lock); 89 98 90 99 return &m->map;

+3

samples/bpf/.gitignore

··· 52 52 xdpsock 53 53 xsk_fwd 54 54 testfile.img 55 + hbm_out.log 56 + iperf.* 57 + *.out

+12 -12

samples/bpf/Makefile

··· 48 48 tprogs-y += cpustat 49 49 tprogs-y += xdp_adjust_tail 50 50 tprogs-y += xdpsock 51 + tprogs-y += xdpsock_ctrl_proc 51 52 tprogs-y += xsk_fwd 52 53 tprogs-y += xdp_fwd 53 54 tprogs-y += task_fd_query ··· 74 73 tracex6-objs := tracex6_user.o 75 74 tracex7-objs := tracex7_user.o 76 75 test_probe_write_user-objs := test_probe_write_user_user.o 77 - trace_output-objs := trace_output_user.o $(TRACE_HELPERS) 76 + trace_output-objs := trace_output_user.o 78 77 lathist-objs := lathist_user.o 79 78 offwaketime-objs := offwaketime_user.o $(TRACE_HELPERS) 80 79 spintest-objs := spintest_user.o $(TRACE_HELPERS) 81 80 map_perf_test-objs := map_perf_test_user.o 82 - test_overhead-objs := bpf_load.o test_overhead_user.o 81 + test_overhead-objs := test_overhead_user.o 83 82 test_cgrp2_array_pin-objs := test_cgrp2_array_pin.o 84 83 test_cgrp2_attach-objs := test_cgrp2_attach.o 85 84 test_cgrp2_sock-objs := test_cgrp2_sock.o 86 - test_cgrp2_sock2-objs := bpf_load.o test_cgrp2_sock2.o 85 + test_cgrp2_sock2-objs := test_cgrp2_sock2.o 87 86 xdp1-objs := xdp1_user.o 88 87 # reuse xdp1 source intentionally 89 88 xdp2-objs := xdp1_user.o ··· 92 91 test_current_task_under_cgroup_user.o 93 92 trace_event-objs := trace_event_user.o $(TRACE_HELPERS) 94 93 sampleip-objs := sampleip_user.o $(TRACE_HELPERS) 95 - tc_l2_redirect-objs := bpf_load.o tc_l2_redirect_user.o 96 - lwt_len_hist-objs := bpf_load.o lwt_len_hist_user.o 94 + tc_l2_redirect-objs := tc_l2_redirect_user.o 95 + lwt_len_hist-objs := lwt_len_hist_user.o 97 96 xdp_tx_iptunnel-objs := xdp_tx_iptunnel_user.o 98 97 test_map_in_map-objs := test_map_in_map_user.o 99 98 per_socket_stats_example-objs := cookie_uid_helper_example.o ··· 106 105 cpustat-objs := cpustat_user.o 107 106 xdp_adjust_tail-objs := xdp_adjust_tail_user.o 108 107 xdpsock-objs := xdpsock_user.o 108 + xdpsock_ctrl_proc-objs := xdpsock_ctrl_proc.o 109 109 xsk_fwd-objs := xsk_fwd.o 110 110 xdp_fwd-objs := xdp_fwd_user.o 111 - task_fd_query-objs := bpf_load.o 
task_fd_query_user.o $(TRACE_HELPERS) 112 - xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) 113 - ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS) 114 - hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS) 111 + task_fd_query-objs := task_fd_query_user.o $(TRACE_HELPERS) 112 + xdp_sample_pkts-objs := xdp_sample_pkts_user.o 113 + ibumad-objs := ibumad_user.o 114 + hbm-objs := hbm.o $(CGROUP_HELPERS) 115 115 116 116 # Tell kbuild to always build the programs 117 117 always-y := $(tprogs-y) ··· 199 197 TPROGS_LDFLAGS := -L$(SYSROOT)/usr/lib 200 198 endif 201 199 202 - TPROGCFLAGS_bpf_load.o += -Wno-unused-variable 203 - 204 200 TPROGS_LDLIBS += $(LIBBPF) -lelf -lz 205 201 TPROGLDLIBS_tracex4 += -lrt 206 202 TPROGLDLIBS_trace_output += -lrt 207 203 TPROGLDLIBS_map_perf_test += -lrt 208 204 TPROGLDLIBS_test_overhead += -lrt 209 - TPROGLDLIBS_xdpsock += -pthread 205 + TPROGLDLIBS_xdpsock += -pthread -lcap 210 206 TPROGLDLIBS_xsk_fwd += -pthread 211 207 212 208 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:

-667

samples/bpf/bpf_load.c

··· 1 - // SPDX-License-Identifier: GPL-2.0 2 - #include <stdio.h> 3 - #include <sys/types.h> 4 - #include <sys/stat.h> 5 - #include <fcntl.h> 6 - #include <libelf.h> 7 - #include <gelf.h> 8 - #include <errno.h> 9 - #include <unistd.h> 10 - #include <string.h> 11 - #include <stdbool.h> 12 - #include <stdlib.h> 13 - #include <linux/bpf.h> 14 - #include <linux/filter.h> 15 - #include <linux/perf_event.h> 16 - #include <linux/netlink.h> 17 - #include <linux/rtnetlink.h> 18 - #include <linux/types.h> 19 - #include <sys/socket.h> 20 - #include <sys/syscall.h> 21 - #include <sys/ioctl.h> 22 - #include <sys/mman.h> 23 - #include <poll.h> 24 - #include <ctype.h> 25 - #include <assert.h> 26 - #include <bpf/bpf.h> 27 - #include "bpf_load.h" 28 - #include "perf-sys.h" 29 - 30 - #define DEBUGFS "/sys/kernel/debug/tracing/" 31 - 32 - static char license[128]; 33 - static int kern_version; 34 - static bool processed_sec[128]; 35 - char bpf_log_buf[BPF_LOG_BUF_SIZE]; 36 - int map_fd[MAX_MAPS]; 37 - int prog_fd[MAX_PROGS]; 38 - int event_fd[MAX_PROGS]; 39 - int prog_cnt; 40 - int prog_array_fd = -1; 41 - 42 - struct bpf_map_data map_data[MAX_MAPS]; 43 - int map_data_count; 44 - 45 - static int populate_prog_array(const char *event, int prog_fd) 46 - { 47 - int ind = atoi(event), err; 48 - 49 - err = bpf_map_update_elem(prog_array_fd, &ind, &prog_fd, BPF_ANY); 50 - if (err < 0) { 51 - printf("failed to store prog_fd in prog_array\n"); 52 - return -1; 53 - } 54 - return 0; 55 - } 56 - 57 - static int write_kprobe_events(const char *val) 58 - { 59 - int fd, ret, flags; 60 - 61 - if (val == NULL) 62 - return -1; 63 - else if (val[0] == '\0') 64 - flags = O_WRONLY | O_TRUNC; 65 - else 66 - flags = O_WRONLY | O_APPEND; 67 - 68 - fd = open(DEBUGFS "kprobe_events", flags); 69 - 70 - ret = write(fd, val, strlen(val)); 71 - close(fd); 72 - 73 - return ret; 74 - } 75 - 76 - static int load_and_attach(const char *event, struct bpf_insn *prog, int size) 77 - { 78 - bool is_socket = 
strncmp(event, "socket", 6) == 0; 79 - bool is_kprobe = strncmp(event, "kprobe/", 7) == 0; 80 - bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0; 81 - bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0; 82 - bool is_raw_tracepoint = strncmp(event, "raw_tracepoint/", 15) == 0; 83 - bool is_xdp = strncmp(event, "xdp", 3) == 0; 84 - bool is_perf_event = strncmp(event, "perf_event", 10) == 0; 85 - bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0; 86 - bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0; 87 - bool is_sockops = strncmp(event, "sockops", 7) == 0; 88 - bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0; 89 - bool is_sk_msg = strncmp(event, "sk_msg", 6) == 0; 90 - size_t insns_cnt = size / sizeof(struct bpf_insn); 91 - enum bpf_prog_type prog_type; 92 - char buf[256]; 93 - int fd, efd, err, id; 94 - struct perf_event_attr attr = {}; 95 - 96 - attr.type = PERF_TYPE_TRACEPOINT; 97 - attr.sample_type = PERF_SAMPLE_RAW; 98 - attr.sample_period = 1; 99 - attr.wakeup_events = 1; 100 - 101 - if (is_socket) { 102 - prog_type = BPF_PROG_TYPE_SOCKET_FILTER; 103 - } else if (is_kprobe || is_kretprobe) { 104 - prog_type = BPF_PROG_TYPE_KPROBE; 105 - } else if (is_tracepoint) { 106 - prog_type = BPF_PROG_TYPE_TRACEPOINT; 107 - } else if (is_raw_tracepoint) { 108 - prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT; 109 - } else if (is_xdp) { 110 - prog_type = BPF_PROG_TYPE_XDP; 111 - } else if (is_perf_event) { 112 - prog_type = BPF_PROG_TYPE_PERF_EVENT; 113 - } else if (is_cgroup_skb) { 114 - prog_type = BPF_PROG_TYPE_CGROUP_SKB; 115 - } else if (is_cgroup_sk) { 116 - prog_type = BPF_PROG_TYPE_CGROUP_SOCK; 117 - } else if (is_sockops) { 118 - prog_type = BPF_PROG_TYPE_SOCK_OPS; 119 - } else if (is_sk_skb) { 120 - prog_type = BPF_PROG_TYPE_SK_SKB; 121 - } else if (is_sk_msg) { 122 - prog_type = BPF_PROG_TYPE_SK_MSG; 123 - } else { 124 - printf("Unknown event '%s'\n", event); 125 - return -1; 126 - } 127 - 128 - if (prog_cnt == MAX_PROGS) 129 - 
return -1; 130 - 131 - fd = bpf_load_program(prog_type, prog, insns_cnt, license, kern_version, 132 - bpf_log_buf, BPF_LOG_BUF_SIZE); 133 - if (fd < 0) { 134 - printf("bpf_load_program() err=%d\n%s", errno, bpf_log_buf); 135 - return -1; 136 - } 137 - 138 - prog_fd[prog_cnt++] = fd; 139 - 140 - if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk) 141 - return 0; 142 - 143 - if (is_socket || is_sockops || is_sk_skb || is_sk_msg) { 144 - if (is_socket) 145 - event += 6; 146 - else 147 - event += 7; 148 - if (*event != '/') 149 - return 0; 150 - event++; 151 - if (!isdigit(*event)) { 152 - printf("invalid prog number\n"); 153 - return -1; 154 - } 155 - return populate_prog_array(event, fd); 156 - } 157 - 158 - if (is_raw_tracepoint) { 159 - efd = bpf_raw_tracepoint_open(event + 15, fd); 160 - if (efd < 0) { 161 - printf("tracepoint %s %s\n", event + 15, strerror(errno)); 162 - return -1; 163 - } 164 - event_fd[prog_cnt - 1] = efd; 165 - return 0; 166 - } 167 - 168 - if (is_kprobe || is_kretprobe) { 169 - bool need_normal_check = true; 170 - const char *event_prefix = ""; 171 - 172 - if (is_kprobe) 173 - event += 7; 174 - else 175 - event += 10; 176 - 177 - if (*event == 0) { 178 - printf("event name cannot be empty\n"); 179 - return -1; 180 - } 181 - 182 - if (isdigit(*event)) 183 - return populate_prog_array(event, fd); 184 - 185 - #ifdef __x86_64__ 186 - if (strncmp(event, "sys_", 4) == 0) { 187 - snprintf(buf, sizeof(buf), "%c:__x64_%s __x64_%s", 188 - is_kprobe ? 'p' : 'r', event, event); 189 - err = write_kprobe_events(buf); 190 - if (err >= 0) { 191 - need_normal_check = false; 192 - event_prefix = "__x64_"; 193 - } 194 - } 195 - #endif 196 - if (need_normal_check) { 197 - snprintf(buf, sizeof(buf), "%c:%s %s", 198 - is_kprobe ? 
'p' : 'r', event, event); 199 - err = write_kprobe_events(buf); 200 - if (err < 0) { 201 - printf("failed to create kprobe '%s' error '%s'\n", 202 - event, strerror(errno)); 203 - return -1; 204 - } 205 - } 206 - 207 - strcpy(buf, DEBUGFS); 208 - strcat(buf, "events/kprobes/"); 209 - strcat(buf, event_prefix); 210 - strcat(buf, event); 211 - strcat(buf, "/id"); 212 - } else if (is_tracepoint) { 213 - event += 11; 214 - 215 - if (*event == 0) { 216 - printf("event name cannot be empty\n"); 217 - return -1; 218 - } 219 - strcpy(buf, DEBUGFS); 220 - strcat(buf, "events/"); 221 - strcat(buf, event); 222 - strcat(buf, "/id"); 223 - } 224 - 225 - efd = open(buf, O_RDONLY, 0); 226 - if (efd < 0) { 227 - printf("failed to open event %s\n", event); 228 - return -1; 229 - } 230 - 231 - err = read(efd, buf, sizeof(buf)); 232 - if (err < 0 || err >= sizeof(buf)) { 233 - printf("read from '%s' failed '%s'\n", event, strerror(errno)); 234 - return -1; 235 - } 236 - 237 - close(efd); 238 - 239 - buf[err] = 0; 240 - id = atoi(buf); 241 - attr.config = id; 242 - 243 - efd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0); 244 - if (efd < 0) { 245 - printf("event %d fd %d err %s\n", id, efd, strerror(errno)); 246 - return -1; 247 - } 248 - event_fd[prog_cnt - 1] = efd; 249 - err = ioctl(efd, PERF_EVENT_IOC_ENABLE, 0); 250 - if (err < 0) { 251 - printf("ioctl PERF_EVENT_IOC_ENABLE failed err %s\n", 252 - strerror(errno)); 253 - return -1; 254 - } 255 - err = ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd); 256 - if (err < 0) { 257 - printf("ioctl PERF_EVENT_IOC_SET_BPF failed err %s\n", 258 - strerror(errno)); 259 - return -1; 260 - } 261 - 262 - return 0; 263 - } 264 - 265 - static int load_maps(struct bpf_map_data *maps, int nr_maps, 266 - fixup_map_cb fixup_map) 267 - { 268 - int i, numa_node; 269 - 270 - for (i = 0; i < nr_maps; i++) { 271 - if (fixup_map) { 272 - fixup_map(&maps[i], i); 273 - /* Allow userspace to assign map FD prior to creation */ 274 - if (maps[i].fd 
!= -1) { 275 - map_fd[i] = maps[i].fd; 276 - continue; 277 - } 278 - } 279 - 280 - numa_node = maps[i].def.map_flags & BPF_F_NUMA_NODE ? 281 - maps[i].def.numa_node : -1; 282 - 283 - if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS || 284 - maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) { 285 - int inner_map_fd = map_fd[maps[i].def.inner_map_idx]; 286 - 287 - map_fd[i] = bpf_create_map_in_map_node(maps[i].def.type, 288 - maps[i].name, 289 - maps[i].def.key_size, 290 - inner_map_fd, 291 - maps[i].def.max_entries, 292 - maps[i].def.map_flags, 293 - numa_node); 294 - } else { 295 - map_fd[i] = bpf_create_map_node(maps[i].def.type, 296 - maps[i].name, 297 - maps[i].def.key_size, 298 - maps[i].def.value_size, 299 - maps[i].def.max_entries, 300 - maps[i].def.map_flags, 301 - numa_node); 302 - } 303 - if (map_fd[i] < 0) { 304 - printf("failed to create map %d (%s): %d %s\n", 305 - i, maps[i].name, errno, strerror(errno)); 306 - return 1; 307 - } 308 - maps[i].fd = map_fd[i]; 309 - 310 - if (maps[i].def.type == BPF_MAP_TYPE_PROG_ARRAY) 311 - prog_array_fd = map_fd[i]; 312 - } 313 - return 0; 314 - } 315 - 316 - static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname, 317 - GElf_Shdr *shdr, Elf_Data **data) 318 - { 319 - Elf_Scn *scn; 320 - 321 - scn = elf_getscn(elf, i); 322 - if (!scn) 323 - return 1; 324 - 325 - if (gelf_getshdr(scn, shdr) != shdr) 326 - return 2; 327 - 328 - *shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name); 329 - if (!*shname || !shdr->sh_size) 330 - return 3; 331 - 332 - *data = elf_getdata(scn, 0); 333 - if (!*data || elf_getdata(scn, *data) != NULL) 334 - return 4; 335 - 336 - return 0; 337 - } 338 - 339 - static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols, 340 - GElf_Shdr *shdr, struct bpf_insn *insn, 341 - struct bpf_map_data *maps, int nr_maps) 342 - { 343 - int i, nrels; 344 - 345 - nrels = shdr->sh_size / shdr->sh_entsize; 346 - 347 - for (i = 0; i < nrels; i++) { 348 - GElf_Sym sym; 349 - GElf_Rel rel; 350 
- unsigned int insn_idx; 351 - bool match = false; 352 - int j, map_idx; 353 - 354 - gelf_getrel(data, i, &rel); 355 - 356 - insn_idx = rel.r_offset / sizeof(struct bpf_insn); 357 - 358 - gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym); 359 - 360 - if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) { 361 - printf("invalid relo for insn[%d].code 0x%x\n", 362 - insn_idx, insn[insn_idx].code); 363 - return 1; 364 - } 365 - insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD; 366 - 367 - /* Match FD relocation against recorded map_data[] offset */ 368 - for (map_idx = 0; map_idx < nr_maps; map_idx++) { 369 - if (maps[map_idx].elf_offset == sym.st_value) { 370 - match = true; 371 - break; 372 - } 373 - } 374 - if (match) { 375 - insn[insn_idx].imm = maps[map_idx].fd; 376 - } else { 377 - printf("invalid relo for insn[%d] no map_data match\n", 378 - insn_idx); 379 - return 1; 380 - } 381 - } 382 - 383 - return 0; 384 - } 385 - 386 - static int cmp_symbols(const void *l, const void *r) 387 - { 388 - const GElf_Sym *lsym = (const GElf_Sym *)l; 389 - const GElf_Sym *rsym = (const GElf_Sym *)r; 390 - 391 - if (lsym->st_value < rsym->st_value) 392 - return -1; 393 - else if (lsym->st_value > rsym->st_value) 394 - return 1; 395 - else 396 - return 0; 397 - } 398 - 399 - static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx, 400 - Elf *elf, Elf_Data *symbols, int strtabidx) 401 - { 402 - int map_sz_elf, map_sz_copy; 403 - bool validate_zero = false; 404 - Elf_Data *data_maps; 405 - int i, nr_maps; 406 - GElf_Sym *sym; 407 - Elf_Scn *scn; 408 - int copy_sz; 409 - 410 - if (maps_shndx < 0) 411 - return -EINVAL; 412 - if (!symbols) 413 - return -EINVAL; 414 - 415 - /* Get data for maps section via elf index */ 416 - scn = elf_getscn(elf, maps_shndx); 417 - if (scn) 418 - data_maps = elf_getdata(scn, NULL); 419 - if (!scn || !data_maps) { 420 - printf("Failed to get Elf_Data from maps section %d\n", 421 - maps_shndx); 422 - return -EINVAL; 423 - } 424 - 425 - /* 
For each map get corrosponding symbol table entry */ 426 - sym = calloc(MAX_MAPS+1, sizeof(GElf_Sym)); 427 - for (i = 0, nr_maps = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) { 428 - assert(nr_maps < MAX_MAPS+1); 429 - if (!gelf_getsym(symbols, i, &sym[nr_maps])) 430 - continue; 431 - if (sym[nr_maps].st_shndx != maps_shndx) 432 - continue; 433 - /* Only increment iif maps section */ 434 - nr_maps++; 435 - } 436 - 437 - /* Align to map_fd[] order, via sort on offset in sym.st_value */ 438 - qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols); 439 - 440 - /* Keeping compatible with ELF maps section changes 441 - * ------------------------------------------------ 442 - * The program size of struct bpf_load_map_def is known by loader 443 - * code, but struct stored in ELF file can be different. 444 - * 445 - * Unfortunately sym[i].st_size is zero. To calculate the 446 - * struct size stored in the ELF file, assume all struct have 447 - * the same size, and simply divide with number of map 448 - * symbols. 449 - */ 450 - map_sz_elf = data_maps->d_size / nr_maps; 451 - map_sz_copy = sizeof(struct bpf_load_map_def); 452 - if (map_sz_elf < map_sz_copy) { 453 - /* 454 - * Backward compat, loading older ELF file with 455 - * smaller struct, keeping remaining bytes zero. 456 - */ 457 - map_sz_copy = map_sz_elf; 458 - } else if (map_sz_elf > map_sz_copy) { 459 - /* 460 - * Forward compat, loading newer ELF file with larger 461 - * struct with unknown features. Assume zero means 462 - * feature not used. Thus, validate rest of struct 463 - * data is zero. 
464 - */ 465 - validate_zero = true; 466 - } 467 - 468 - /* Memcpy relevant part of ELF maps data to loader maps */ 469 - for (i = 0; i < nr_maps; i++) { 470 - struct bpf_load_map_def *def; 471 - unsigned char *addr, *end; 472 - const char *map_name; 473 - size_t offset; 474 - 475 - map_name = elf_strptr(elf, strtabidx, sym[i].st_name); 476 - maps[i].name = strdup(map_name); 477 - if (!maps[i].name) { 478 - printf("strdup(%s): %s(%d)\n", map_name, 479 - strerror(errno), errno); 480 - free(sym); 481 - return -errno; 482 - } 483 - 484 - /* Symbol value is offset into ELF maps section data area */ 485 - offset = sym[i].st_value; 486 - def = (struct bpf_load_map_def *)(data_maps->d_buf + offset); 487 - maps[i].elf_offset = offset; 488 - memset(&maps[i].def, 0, sizeof(struct bpf_load_map_def)); 489 - memcpy(&maps[i].def, def, map_sz_copy); 490 - 491 - /* Verify no newer features were requested */ 492 - if (validate_zero) { 493 - addr = (unsigned char *) def + map_sz_copy; 494 - end = (unsigned char *) def + map_sz_elf; 495 - for (; addr < end; addr++) { 496 - if (*addr != 0) { 497 - free(sym); 498 - return -EFBIG; 499 - } 500 - } 501 - } 502 - } 503 - 504 - free(sym); 505 - return nr_maps; 506 - } 507 - 508 - static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) 509 - { 510 - int fd, i, ret, maps_shndx = -1, strtabidx = -1; 511 - Elf *elf; 512 - GElf_Ehdr ehdr; 513 - GElf_Shdr shdr, shdr_prog; 514 - Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL; 515 - char *shname, *shname_prog; 516 - int nr_maps = 0; 517 - 518 - /* reset global variables */ 519 - kern_version = 0; 520 - memset(license, 0, sizeof(license)); 521 - memset(processed_sec, 0, sizeof(processed_sec)); 522 - 523 - if (elf_version(EV_CURRENT) == EV_NONE) 524 - return 1; 525 - 526 - fd = open(path, O_RDONLY, 0); 527 - if (fd < 0) 528 - return 1; 529 - 530 - elf = elf_begin(fd, ELF_C_READ, NULL); 531 - 532 - if (!elf) 533 - return 1; 534 - 535 - if (gelf_getehdr(elf, &ehdr) != 
&ehdr) 536 - return 1; 537 - 538 - /* clear all kprobes */ 539 - i = write_kprobe_events(""); 540 - 541 - /* scan over all elf sections to get license and map info */ 542 - for (i = 1; i < ehdr.e_shnum; i++) { 543 - 544 - if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) 545 - continue; 546 - 547 - if (0) /* helpful for llvm debugging */ 548 - printf("section %d:%s data %p size %zd link %d flags %d\n", 549 - i, shname, data->d_buf, data->d_size, 550 - shdr.sh_link, (int) shdr.sh_flags); 551 - 552 - if (strcmp(shname, "license") == 0) { 553 - processed_sec[i] = true; 554 - memcpy(license, data->d_buf, data->d_size); 555 - } else if (strcmp(shname, "version") == 0) { 556 - processed_sec[i] = true; 557 - if (data->d_size != sizeof(int)) { 558 - printf("invalid size of version section %zd\n", 559 - data->d_size); 560 - return 1; 561 - } 562 - memcpy(&kern_version, data->d_buf, sizeof(int)); 563 - } else if (strcmp(shname, "maps") == 0) { 564 - int j; 565 - 566 - maps_shndx = i; 567 - data_maps = data; 568 - for (j = 0; j < MAX_MAPS; j++) 569 - map_data[j].fd = -1; 570 - } else if (shdr.sh_type == SHT_SYMTAB) { 571 - strtabidx = shdr.sh_link; 572 - symbols = data; 573 - } 574 - } 575 - 576 - ret = 1; 577 - 578 - if (!symbols) { 579 - printf("missing SHT_SYMTAB section\n"); 580 - goto done; 581 - } 582 - 583 - if (data_maps) { 584 - nr_maps = load_elf_maps_section(map_data, maps_shndx, 585 - elf, symbols, strtabidx); 586 - if (nr_maps < 0) { 587 - printf("Error: Failed loading ELF maps (errno:%d):%s\n", 588 - nr_maps, strerror(-nr_maps)); 589 - goto done; 590 - } 591 - if (load_maps(map_data, nr_maps, fixup_map)) 592 - goto done; 593 - map_data_count = nr_maps; 594 - 595 - processed_sec[maps_shndx] = true; 596 - } 597 - 598 - /* process all relo sections, and rewrite bpf insns for maps */ 599 - for (i = 1; i < ehdr.e_shnum; i++) { 600 - if (processed_sec[i]) 601 - continue; 602 - 603 - if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) 604 - continue; 605 - 606 - if 
(shdr.sh_type == SHT_REL) { 607 - struct bpf_insn *insns; 608 - 609 - /* locate prog sec that need map fixup (relocations) */ 610 - if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog, 611 - &shdr_prog, &data_prog)) 612 - continue; 613 - 614 - if (shdr_prog.sh_type != SHT_PROGBITS || 615 - !(shdr_prog.sh_flags & SHF_EXECINSTR)) 616 - continue; 617 - 618 - insns = (struct bpf_insn *) data_prog->d_buf; 619 - processed_sec[i] = true; /* relo section */ 620 - 621 - if (parse_relo_and_apply(data, symbols, &shdr, insns, 622 - map_data, nr_maps)) 623 - continue; 624 - } 625 - } 626 - 627 - /* load programs */ 628 - for (i = 1; i < ehdr.e_shnum; i++) { 629 - 630 - if (processed_sec[i]) 631 - continue; 632 - 633 - if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) 634 - continue; 635 - 636 - if (memcmp(shname, "kprobe/", 7) == 0 || 637 - memcmp(shname, "kretprobe/", 10) == 0 || 638 - memcmp(shname, "tracepoint/", 11) == 0 || 639 - memcmp(shname, "raw_tracepoint/", 15) == 0 || 640 - memcmp(shname, "xdp", 3) == 0 || 641 - memcmp(shname, "perf_event", 10) == 0 || 642 - memcmp(shname, "socket", 6) == 0 || 643 - memcmp(shname, "cgroup/", 7) == 0 || 644 - memcmp(shname, "sockops", 7) == 0 || 645 - memcmp(shname, "sk_skb", 6) == 0 || 646 - memcmp(shname, "sk_msg", 6) == 0) { 647 - ret = load_and_attach(shname, data->d_buf, 648 - data->d_size); 649 - if (ret != 0) 650 - goto done; 651 - } 652 - } 653 - 654 - done: 655 - close(fd); 656 - return ret; 657 - } 658 - 659 - int load_bpf_file(char *path) 660 - { 661 - return do_load_bpf_file(path, NULL); 662 - } 663 - 664 - int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map) 665 - { 666 - return do_load_bpf_file(path, fixup_map); 667 - }

-57

samples/bpf/bpf_load.h

··· 1 - /* SPDX-License-Identifier: GPL-2.0 */ 2 - #ifndef __BPF_LOAD_H 3 - #define __BPF_LOAD_H 4 - 5 - #include <bpf/bpf.h> 6 - 7 - #define MAX_MAPS 32 8 - #define MAX_PROGS 32 9 - 10 - struct bpf_load_map_def { 11 - unsigned int type; 12 - unsigned int key_size; 13 - unsigned int value_size; 14 - unsigned int max_entries; 15 - unsigned int map_flags; 16 - unsigned int inner_map_idx; 17 - unsigned int numa_node; 18 - }; 19 - 20 - struct bpf_map_data { 21 - int fd; 22 - char *name; 23 - size_t elf_offset; 24 - struct bpf_load_map_def def; 25 - }; 26 - 27 - typedef void (*fixup_map_cb)(struct bpf_map_data *map, int idx); 28 - 29 - extern int prog_fd[MAX_PROGS]; 30 - extern int event_fd[MAX_PROGS]; 31 - extern char bpf_log_buf[BPF_LOG_BUF_SIZE]; 32 - extern int prog_cnt; 33 - 34 - /* There is a one-to-one mapping between map_fd[] and map_data[]. 35 - * The map_data[] just contains more rich info on the given map. 36 - */ 37 - extern int map_fd[MAX_MAPS]; 38 - extern struct bpf_map_data map_data[MAX_MAPS]; 39 - extern int map_data_count; 40 - 41 - /* parses elf file compiled by llvm .c->.o 42 - * . parses 'maps' section and creates maps via BPF syscall 43 - * . parses 'license' section and passes it to syscall 44 - * . parses elf relocations for BPF maps and adjusts BPF_LD_IMM64 insns by 45 - * storing map_fd into insn->imm and marking such insns as BPF_PSEUDO_MAP_FD 46 - * . loads eBPF programs via BPF syscall 47 - * 48 - * One ELF file can contain multiple BPF programs which will be loaded 49 - * and their FDs stored stored in prog_fd array 50 - * 51 - * returns zero on success 52 - */ 53 - int load_bpf_file(char *path); 54 - int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map); 55 - 56 - int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags); 57 - #endif

+14 -18

samples/bpf/do_hbm_test.sh

··· 91 91 flags="" 92 92 do_stats=0 93 93 94 + BPFFS=/sys/fs/bpf 95 + function config_bpffs () { 96 + if mount | grep $BPFFS > /dev/null; then 97 + echo "bpffs already mounted" 98 + else 99 + echo "bpffs not mounted. Mounting..." 100 + mount -t bpf none $BPFFS 101 + fi 102 + } 103 + 94 104 function start_hbm () { 95 105 rm -f hbm.out 96 106 echo "./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog" > hbm.out ··· 202 192 } 203 193 204 194 processArgs 195 + config_bpffs 205 196 206 197 if [ $debug_flag -eq 1 ] ; then 207 198 rm -f hbm_out.log ··· 212 201 usleep 100000 213 202 214 203 host=`hostname` 215 - cg_base_dir=/sys/fs/cgroup 204 + cg_base_dir=/sys/fs/cgroup/unified 216 205 cg_dir="$cg_base_dir/cgroup-test-work-dir/hbm$id" 217 206 218 207 echo $$ >> $cg_dir/cgroup.procs ··· 422 411 423 412 sleep 1 424 413 425 - # Detach any BPF programs that may have lingered 426 - ttx=`bpftool cgroup tree | grep hbm` 427 - v=2 428 - for x in $ttx ; do 429 - if [ "${x:0:36}" == "/sys/fs/cgroup/cgroup-test-work-dir/" ] ; then 430 - cg=$x ; v=0 431 - else 432 - if [ $v -eq 0 ] ; then 433 - id=$x ; v=1 434 - else 435 - if [ $v -eq 1 ] ; then 436 - type=$x ; bpftool cgroup detach $cg $type id $id 437 - v=0 438 - fi 439 - fi 440 - fi 441 - done 414 + # Detach any pinned BPF programs that may have lingered 415 + rm -rf $BPFFS/hbm* 442 416 443 417 if [ $use_netperf -ne 0 ] ; then 444 418 if [ "$server" == "" ] ; then

+60 -53

samples/bpf/hbm.c

··· 46 46 #include <bpf/bpf.h> 47 47 #include <getopt.h> 48 48 49 - #include "bpf_load.h" 50 49 #include "bpf_rlimit.h" 51 50 #include "cgroup_helpers.h" 52 51 #include "hbm.h" ··· 69 70 70 71 #define DEBUGFS "/sys/kernel/debug/tracing/" 71 72 72 - struct bpf_object *obj; 73 - int bpfprog_fd; 74 - int cgroup_storage_fd; 73 + static struct bpf_program *bpf_prog; 74 + static struct bpf_object *obj; 75 + static int queue_stats_fd; 75 76 76 77 static void read_trace_pipe2(void) 77 78 { ··· 120 121 121 122 static int prog_load(char *prog) 122 123 { 123 - struct bpf_prog_load_attr prog_load_attr = { 124 - .prog_type = BPF_PROG_TYPE_CGROUP_SKB, 125 - .file = prog, 126 - .expected_attach_type = BPF_CGROUP_INET_EGRESS, 127 - }; 128 - int map_fd; 129 - struct bpf_map *map; 130 - 131 - int ret = 0; 132 - 133 - if (access(prog, O_RDONLY) < 0) { 134 - printf("Error accessing file %s: %s\n", prog, strerror(errno)); 124 + obj = bpf_object__open_file(prog, NULL); 125 + if (libbpf_get_error(obj)) { 126 + printf("ERROR: opening BPF object file failed\n"); 135 127 return 1; 136 128 } 137 - if (bpf_prog_load_xattr(&prog_load_attr, &obj, &bpfprog_fd)) 138 - ret = 1; 139 - if (!ret) { 140 - map = bpf_object__find_map_by_name(obj, "queue_stats"); 141 - map_fd = bpf_map__fd(map); 142 - if (map_fd < 0) { 143 - printf("Map not found: %s\n", strerror(map_fd)); 144 - ret = 1; 145 - } 129 + 130 + /* load BPF program */ 131 + if (bpf_object__load(obj)) { 132 + printf("ERROR: loading BPF object file failed\n"); 133 + goto err; 146 134 } 147 135 148 - if (ret) { 149 - printf("ERROR: bpf_prog_load_xattr failed for: %s\n", prog); 150 - printf(" Output from verifier:\n%s\n------\n", bpf_log_buf); 151 - ret = -1; 152 - } else { 153 - ret = map_fd; 136 + bpf_prog = bpf_object__find_program_by_title(obj, "cgroup_skb/egress"); 137 + if (!bpf_prog) { 138 + printf("ERROR: finding a prog in obj file failed\n"); 139 + goto err; 154 140 } 155 141 156 - return ret; 142 + queue_stats_fd = 
bpf_object__find_map_fd_by_name(obj, "queue_stats"); 143 + if (queue_stats_fd < 0) { 144 + printf("ERROR: finding a map in obj file failed\n"); 145 + goto err; 146 + } 147 + 148 + return 0; 149 + 150 + err: 151 + bpf_object__close(obj); 152 + return 1; 157 153 } 158 154 159 155 static int run_bpf_prog(char *prog, int cg_id) 160 156 { 161 - int map_fd; 162 - int rc = 0; 157 + struct hbm_queue_stats qstats = {0}; 158 + char cg_dir[100], cg_pin_path[100]; 159 + struct bpf_link *link = NULL; 163 160 int key = 0; 164 161 int cg1 = 0; 165 - int type = BPF_CGROUP_INET_EGRESS; 166 - char cg_dir[100]; 167 - struct hbm_queue_stats qstats = {0}; 162 + int rc = 0; 168 163 169 164 sprintf(cg_dir, "/hbm%d", cg_id); 170 - map_fd = prog_load(prog); 171 - if (map_fd == -1) 172 - return 1; 165 + rc = prog_load(prog); 166 + if (rc != 0) 167 + return rc; 173 168 174 169 if (setup_cgroup_environment()) { 175 170 printf("ERROR: setting cgroup environment\n"); ··· 183 190 qstats.stats = stats_flag ? 1 : 0; 184 191 qstats.loopback = loopback_flag ? 1 : 0; 185 192 qstats.no_cn = no_cn_flag ? 
1 : 0; 186 - if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) { 193 + if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY)) { 187 194 printf("ERROR: Could not update map element\n"); 188 195 goto err; 189 196 } 190 197 191 198 if (!outFlag) 192 - type = BPF_CGROUP_INET_INGRESS; 193 - if (bpf_prog_attach(bpfprog_fd, cg1, type, 0)) { 194 - printf("ERROR: bpf_prog_attach fails!\n"); 195 - log_err("Attaching prog"); 199 + bpf_program__set_expected_attach_type(bpf_prog, BPF_CGROUP_INET_INGRESS); 200 + 201 + link = bpf_program__attach_cgroup(bpf_prog, cg1); 202 + if (libbpf_get_error(link)) { 203 + fprintf(stderr, "ERROR: bpf_program__attach_cgroup failed\n"); 204 + goto err; 205 + } 206 + 207 + sprintf(cg_pin_path, "/sys/fs/bpf/hbm%d", cg_id); 208 + rc = bpf_link__pin(link, cg_pin_path); 209 + if (rc < 0) { 210 + printf("ERROR: bpf_link__pin failed: %d\n", rc); 196 211 goto err; 197 212 } 198 213 ··· 214 213 #define DELTA_RATE_CHECK 10000 /* in us */ 215 214 #define RATE_THRESHOLD 9500000000 /* 9.5 Gbps */ 216 215 217 - bpf_map_lookup_elem(map_fd, &key, &qstats); 216 + bpf_map_lookup_elem(queue_stats_fd, &key, &qstats); 218 217 if (gettimeofday(&t0, NULL) < 0) 219 218 do_error("gettimeofday failed", true); 220 219 t_last = t0; ··· 243 242 fclose(fin); 244 243 printf(" new_eth_tx_bytes:%llu\n", 245 244 new_eth_tx_bytes); 246 - bpf_map_lookup_elem(map_fd, &key, &qstats); 245 + bpf_map_lookup_elem(queue_stats_fd, &key, &qstats); 247 246 new_cg_tx_bytes = qstats.bytes_total; 248 247 delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes; 249 248 last_eth_tx_bytes = new_eth_tx_bytes; ··· 290 289 rate = minRate; 291 290 qstats.rate = rate; 292 291 } 293 - if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) 292 + if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY)) 294 293 do_error("update map element fails", false); 295 294 } 296 295 } else { 297 296 sleep(dur); 298 297 } 299 298 // Get stats! 
300 - if (stats_flag && bpf_map_lookup_elem(map_fd, &key, &qstats)) { 299 + if (stats_flag && bpf_map_lookup_elem(queue_stats_fd, &key, &qstats)) { 301 300 char fname[100]; 302 301 FILE *fout; 303 302 ··· 395 394 396 395 if (debugFlag) 397 396 read_trace_pipe2(); 398 - return rc; 397 + goto cleanup; 398 + 399 399 err: 400 400 rc = 1; 401 401 402 - if (cg1) 403 - close(cg1); 404 - cleanup_cgroup_environment(); 402 + cleanup: 403 + bpf_link__destroy(link); 404 + bpf_object__close(obj); 405 405 406 + if (cg1 != -1) 407 + close(cg1); 408 + 409 + if (rc != 0) 410 + cleanup_cgroup_environment(); 406 411 return rc; 407 412 } 408 413

+1 -1

samples/bpf/hbm_kern.h

··· 69 69 __uint(type, BPF_MAP_TYPE_ARRAY); 70 70 __uint(max_entries, 1); 71 71 __type(key, u32); 72 - __type(value, struct hvm_queue_stats); 72 + __type(value, struct hbm_queue_stats); 73 73 } queue_stats SEC(".maps"); 74 74 75 75 struct hbm_pkt_info {

+12 -12

samples/bpf/ibumad_kern.c

··· 16 16 #include <bpf/bpf_helpers.h> 17 17 18 18 19 - struct bpf_map_def SEC("maps") read_count = { 20 - .type = BPF_MAP_TYPE_ARRAY, 21 - .key_size = sizeof(u32), /* class; u32 required */ 22 - .value_size = sizeof(u64), /* count of mads read */ 23 - .max_entries = 256, /* Room for all Classes */ 24 - }; 19 + struct { 20 + __uint(type, BPF_MAP_TYPE_ARRAY); 21 + __type(key, u32); /* class; u32 required */ 22 + __type(value, u64); /* count of mads read */ 23 + __uint(max_entries, 256); /* Room for all Classes */ 24 + } read_count SEC(".maps"); 25 25 26 - struct bpf_map_def SEC("maps") write_count = { 27 - .type = BPF_MAP_TYPE_ARRAY, 28 - .key_size = sizeof(u32), /* class; u32 required */ 29 - .value_size = sizeof(u64), /* count of mads written */ 30 - .max_entries = 256, /* Room for all Classes */ 31 - }; 26 + struct { 27 + __uint(type, BPF_MAP_TYPE_ARRAY); 28 + __type(key, u32); /* class; u32 required */ 29 + __type(value, u64); /* count of mads written */ 30 + __uint(max_entries, 256); /* Room for all Classes */ 31 + } write_count SEC(".maps"); 32 32 33 33 #undef DEBUG 34 34 #ifndef DEBUG

+54 -17

samples/bpf/ibumad_user.c

··· 23 23 #include <getopt.h> 24 24 #include <net/if.h> 25 25 26 - #include "bpf_load.h" 26 + #include <bpf/bpf.h> 27 27 #include "bpf_util.h" 28 28 #include <bpf/libbpf.h> 29 + 30 + static struct bpf_link *tp_links[3]; 31 + static struct bpf_object *obj; 32 + static int map_fd[2]; 33 + static int tp_cnt; 29 34 30 35 static void dump_counts(int fd) 31 36 { ··· 58 53 static void dump_exit(int sig) 59 54 { 60 55 dump_all_counts(); 56 + /* Detach tracepoints */ 57 + while (tp_cnt) 58 + bpf_link__destroy(tp_links[--tp_cnt]); 59 + 60 + bpf_object__close(obj); 61 61 exit(0); 62 62 } 63 63 ··· 83 73 84 74 int main(int argc, char **argv) 85 75 { 76 + struct bpf_program *prog; 86 77 unsigned long delay = 5; 78 + char filename[256]; 87 79 int longindex = 0; 88 - int opt; 89 - char bpf_file[256]; 90 - 91 - /* Create the eBPF kernel code path name. 92 - * This follows the pattern of all of the other bpf samples 93 - */ 94 - snprintf(bpf_file, sizeof(bpf_file), "%s_kern.o", argv[0]); 95 - 96 - /* Do one final dump when exiting */ 97 - signal(SIGINT, dump_exit); 98 - signal(SIGTERM, dump_exit); 80 + int opt, err = -1; 99 81 100 82 while ((opt = getopt_long(argc, argv, "hd:rSw", 101 83 long_options, &longindex)) != -1) { ··· 109 107 } 110 108 } 111 109 112 - if (load_bpf_file(bpf_file)) { 113 - fprintf(stderr, "ERROR: failed to load eBPF from file : %s\n", 114 - bpf_file); 115 - return 1; 110 + /* Do one final dump when exiting */ 111 + signal(SIGINT, dump_exit); 112 + signal(SIGTERM, dump_exit); 113 + 114 + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 115 + obj = bpf_object__open_file(filename, NULL); 116 + if (libbpf_get_error(obj)) { 117 + fprintf(stderr, "ERROR: opening BPF object file failed\n"); 118 + return err; 119 + } 120 + 121 + /* load BPF program */ 122 + if (bpf_object__load(obj)) { 123 + fprintf(stderr, "ERROR: loading BPF object file failed\n"); 124 + goto cleanup; 125 + } 126 + 127 + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "read_count"); 
128 + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "write_count"); 129 + if (map_fd[0] < 0 || map_fd[1] < 0) { 130 + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); 131 + goto cleanup; 132 + } 133 + 134 + bpf_object__for_each_program(prog, obj) { 135 + tp_links[tp_cnt] = bpf_program__attach(prog); 136 + if (libbpf_get_error(tp_links[tp_cnt])) { 137 + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); 138 + tp_links[tp_cnt] = NULL; 139 + goto cleanup; 140 + } 141 + tp_cnt++; 116 142 } 117 143 118 144 while (1) { 119 145 sleep(delay); 120 146 dump_all_counts(); 121 147 } 148 + err = 0; 122 149 123 - return 0; 150 + cleanup: 151 + /* Detach tracepoints */ 152 + while (tp_cnt) 153 + bpf_link__destroy(tp_links[--tp_cnt]); 154 + 155 + bpf_object__close(obj); 156 + return err; 124 157 }

+2

samples/bpf/lwt_len_hist.sh

··· 8 8 TRACE_ROOT=/sys/kernel/debug/tracing 9 9 10 10 function cleanup { 11 + # To reset saved histogram, remove pinned map 12 + rm /sys/fs/bpf/tc/globals/lwt_len_hist_map 11 13 ip route del 192.168.253.2/32 dev $VETH0 2> /dev/null 12 14 ip link del $VETH0 2> /dev/null 13 15 ip link del $VETH1 2> /dev/null

-6

samples/bpf/map_perf_test_user.c

··· 421 421 422 422 int main(int argc, char **argv) 423 423 { 424 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 425 424 int nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); 426 425 struct bpf_link *links[8]; 427 426 struct bpf_program *prog; ··· 428 429 struct bpf_map *map; 429 430 char filename[256]; 430 431 int i = 0; 431 - 432 - if (setrlimit(RLIMIT_MEMLOCK, &r)) { 433 - perror("setrlimit(RLIMIT_MEMLOCK)"); 434 - return 1; 435 - } 436 432 437 433 if (argc > 1) 438 434 test_flags = atoi(argv[1]) ? : test_flags;

-6

samples/bpf/offwaketime_user.c

··· 95 95 96 96 int main(int argc, char **argv) 97 97 { 98 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 99 98 struct bpf_object *obj = NULL; 100 99 struct bpf_link *links[2]; 101 100 struct bpf_program *prog; 102 101 int delay = 1, i = 0; 103 102 char filename[256]; 104 - 105 - if (setrlimit(RLIMIT_MEMLOCK, &r)) { 106 - perror("setrlimit(RLIMIT_MEMLOCK)"); 107 - return 1; 108 - } 109 103 110 104 if (load_kallsyms()) { 111 105 printf("failed to process /proc/kallsyms\n");

-2

samples/bpf/sockex2_user.c

··· 16 16 17 17 int main(int ac, char **argv) 18 18 { 19 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 20 19 struct bpf_object *obj; 21 20 int map_fd, prog_fd; 22 21 char filename[256]; ··· 23 24 FILE *f; 24 25 25 26 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 26 - setrlimit(RLIMIT_MEMLOCK, &r); 27 27 28 28 if (bpf_prog_load(filename, BPF_PROG_TYPE_SOCKET_FILTER, 29 29 &obj, &prog_fd))

-2

samples/bpf/sockex3_user.c

··· 26 26 int main(int argc, char **argv) 27 27 { 28 28 int i, sock, key, fd, main_prog_fd, jmp_table_fd, hash_map_fd; 29 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 30 29 struct bpf_program *prog; 31 30 struct bpf_object *obj; 32 31 const char *section; ··· 33 34 FILE *f; 34 35 35 36 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 36 - setrlimit(RLIMIT_MEMLOCK, &r); 37 37 38 38 obj = bpf_object__open_file(filename, NULL); 39 39 if (libbpf_get_error(obj)) {

-6

samples/bpf/spintest_user.c

··· 10 10 11 11 int main(int ac, char **argv) 12 12 { 13 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 14 13 char filename[256], symbol[256]; 15 14 struct bpf_object *obj = NULL; 16 15 struct bpf_link *links[20]; ··· 18 19 int map_fd, i, j = 0; 19 20 const char *section; 20 21 struct ksym *sym; 21 - 22 - if (setrlimit(RLIMIT_MEMLOCK, &r)) { 23 - perror("setrlimit(RLIMIT_MEMLOCK)"); 24 - return 1; 25 - } 26 22 27 23 if (load_kallsyms()) { 28 24 printf("failed to process /proc/kallsyms\n");

-2

samples/bpf/syscall_tp_user.c

··· 115 115 116 116 int main(int argc, char **argv) 117 117 { 118 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 119 118 int opt, num_progs = 1; 120 119 char filename[256]; 121 120 ··· 130 131 } 131 132 } 132 133 133 - setrlimit(RLIMIT_MEMLOCK, &r); 134 134 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 135 135 136 136 return test(filename, num_progs);

+73 -32

samples/bpf/task_fd_query_user.c

··· 15 15 #include <sys/stat.h> 16 16 #include <linux/perf_event.h> 17 17 18 + #include <bpf/bpf.h> 18 19 #include <bpf/libbpf.h> 19 - #include "bpf_load.h" 20 20 #include "bpf_util.h" 21 21 #include "perf-sys.h" 22 22 #include "trace_helpers.h" 23 + 24 + static struct bpf_program *progs[2]; 25 + static struct bpf_link *links[2]; 23 26 24 27 #define CHECK_PERROR_RET(condition) ({ \ 25 28 int __ret = !!(condition); \ ··· 89 86 return ret; 90 87 } 91 88 92 - static int test_debug_fs_kprobe(int prog_fd_idx, const char *fn_name, 89 + static int test_debug_fs_kprobe(int link_idx, const char *fn_name, 93 90 __u32 expected_fd_type) 94 91 { 95 92 __u64 probe_offset, probe_addr; 96 93 __u32 len, prog_id, fd_type; 94 + int err, event_fd; 97 95 char buf[256]; 98 - int err; 99 96 100 97 len = sizeof(buf); 101 - err = bpf_task_fd_query(getpid(), event_fd[prog_fd_idx], 0, buf, &len, 98 + event_fd = bpf_link__fd(links[link_idx]); 99 + err = bpf_task_fd_query(getpid(), event_fd, 0, buf, &len, 102 100 &prog_id, &fd_type, &probe_offset, 103 101 &probe_addr); 104 102 if (err < 0) { 105 103 printf("FAIL: %s, for event_fd idx %d, fn_name %s\n", 106 - __func__, prog_fd_idx, fn_name); 104 + __func__, link_idx, fn_name); 107 105 perror(" :"); 108 106 return -1; 109 107 } ··· 112 108 fd_type != expected_fd_type || 113 109 probe_offset != 0x0 || probe_addr != 0x0) { 114 110 printf("FAIL: bpf_trace_event_query(event_fd[%d]):\n", 115 - prog_fd_idx); 111 + link_idx); 116 112 printf("buf: %s, fd_type: %u, probe_offset: 0x%llx," 117 113 " probe_addr: 0x%llx\n", 118 114 buf, fd_type, probe_offset, probe_addr); ··· 129 125 int is_return_bit = bpf_get_retprobe_bit(event_type); 130 126 int type = bpf_find_probe_type(event_type); 131 127 struct perf_event_attr attr = {}; 132 - int fd; 128 + struct bpf_link *link; 129 + int fd, err = -1; 133 130 134 131 if (type < 0 || is_return_bit < 0) { 135 132 printf("FAIL: %s incorrect type (%d) or is_return_bit (%d)\n", 136 133 __func__, type, is_return_bit); 
137 - return -1; 134 + return err; 138 135 } 139 136 140 137 attr.sample_period = 1; ··· 154 149 attr.type = type; 155 150 156 151 fd = sys_perf_event_open(&attr, -1, 0, -1, 0); 157 - CHECK_PERROR_RET(fd < 0); 152 + link = bpf_program__attach_perf_event(progs[0], fd); 153 + if (libbpf_get_error(link)) { 154 + printf("ERROR: bpf_program__attach_perf_event failed\n"); 155 + link = NULL; 156 + close(fd); 157 + goto cleanup; 158 + } 158 159 159 - CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) < 0); 160 - CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0); 161 160 CHECK_PERROR_RET(bpf_task_fd_query(getpid(), fd, 0, buf, buf_len, 162 161 prog_id, fd_type, probe_offset, probe_addr) < 0); 162 + err = 0; 163 163 164 - return 0; 164 + cleanup: 165 + bpf_link__destroy(link); 166 + return err; 165 167 } 166 168 167 169 static int test_nondebug_fs_probe(const char *event_type, const char *name, ··· 227 215 228 216 static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return) 229 217 { 218 + char buf[256], event_alias[sizeof("test_1234567890")]; 230 219 const char *event_type = "uprobe"; 231 220 struct perf_event_attr attr = {}; 232 - char buf[256], event_alias[sizeof("test_1234567890")]; 233 221 __u64 probe_offset, probe_addr; 234 222 __u32 len, prog_id, fd_type; 235 - int err, res, kfd, efd; 223 + int err = -1, res, kfd, efd; 224 + struct bpf_link *link; 236 225 ssize_t bytes; 237 226 238 227 snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", 239 228 event_type); 240 - kfd = open(buf, O_WRONLY | O_APPEND, 0); 229 + kfd = open(buf, O_WRONLY | O_TRUNC, 0); 241 230 CHECK_PERROR_RET(kfd < 0); 242 231 243 232 res = snprintf(event_alias, sizeof(event_alias), "test_%d", getpid()); ··· 267 254 attr.type = PERF_TYPE_TRACEPOINT; 268 255 attr.sample_period = 1; 269 256 attr.wakeup_events = 1; 257 + 270 258 kfd = sys_perf_event_open(&attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); 271 - CHECK_PERROR_RET(kfd < 0); 272 - 
CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0); 273 - CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_ENABLE, 0) < 0); 259 + link = bpf_program__attach_perf_event(progs[0], kfd); 260 + if (libbpf_get_error(link)) { 261 + printf("ERROR: bpf_program__attach_perf_event failed\n"); 262 + link = NULL; 263 + close(kfd); 264 + goto cleanup; 265 + } 274 266 275 267 len = sizeof(buf); 276 268 err = bpf_task_fd_query(getpid(), kfd, 0, buf, &len, ··· 301 283 probe_offset); 302 284 return -1; 303 285 } 286 + err = 0; 304 287 305 - close(kfd); 306 - return 0; 288 + cleanup: 289 + bpf_link__destroy(link); 290 + return err; 307 291 } 308 292 309 293 int main(int argc, char **argv) 310 294 { 311 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 312 295 extern char __executable_start; 313 296 char filename[256], buf[256]; 314 297 __u64 uprobe_file_offset; 315 - 316 - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 317 - if (setrlimit(RLIMIT_MEMLOCK, &r)) { 318 - perror("setrlimit(RLIMIT_MEMLOCK)"); 319 - return 1; 320 - } 298 + struct bpf_program *prog; 299 + struct bpf_object *obj; 300 + int i = 0, err = -1; 321 301 322 302 if (load_kallsyms()) { 323 303 printf("failed to process /proc/kallsyms\n"); 324 - return 1; 304 + return err; 325 305 } 326 306 327 - if (load_bpf_file(filename)) { 328 - printf("%s", bpf_log_buf); 329 - return 1; 307 + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 308 + obj = bpf_object__open_file(filename, NULL); 309 + if (libbpf_get_error(obj)) { 310 + fprintf(stderr, "ERROR: opening BPF object file failed\n"); 311 + return err; 312 + } 313 + 314 + /* load BPF program */ 315 + if (bpf_object__load(obj)) { 316 + fprintf(stderr, "ERROR: loading BPF object file failed\n"); 317 + goto cleanup; 318 + } 319 + 320 + bpf_object__for_each_program(prog, obj) { 321 + progs[i] = prog; 322 + links[i] = bpf_program__attach(progs[i]); 323 + if (libbpf_get_error(links[i])) { 324 + fprintf(stderr, "ERROR: bpf_program__attach 
failed\n"); 325 + links[i] = NULL; 326 + goto cleanup; 327 + } 328 + i++; 330 329 } 331 330 332 331 /* test two functions in the corresponding *_kern.c file */ ··· 413 378 false)); 414 379 CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset, 415 380 true)); 381 + err = 0; 416 382 417 - return 0; 383 + cleanup: 384 + for (i--; i >= 0; i--) 385 + bpf_link__destroy(links[i]); 386 + 387 + bpf_object__close(obj); 388 + return err; 418 389 }

+48 -21

samples/bpf/test_cgrp2_sock2.c

··· 20 20 #include <net/if.h> 21 21 #include <linux/bpf.h> 22 22 #include <bpf/bpf.h> 23 + #include <bpf/libbpf.h> 23 24 24 25 #include "bpf_insn.h" 25 - #include "bpf_load.h" 26 26 27 27 static int usage(const char *argv0) 28 28 { ··· 32 32 33 33 int main(int argc, char **argv) 34 34 { 35 - int cg_fd, ret, filter_id = 0; 35 + int cg_fd, err, ret = EXIT_FAILURE, filter_id = 0, prog_cnt = 0; 36 + const char *link_pin_path = "/sys/fs/bpf/test_cgrp2_sock2"; 37 + struct bpf_link *link = NULL; 38 + struct bpf_program *progs[2]; 39 + struct bpf_program *prog; 40 + struct bpf_object *obj; 36 41 37 42 if (argc < 3) 38 43 return usage(argv[0]); 39 44 40 - cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY); 41 - if (cg_fd < 0) { 42 - printf("Failed to open cgroup path: '%s'\n", strerror(errno)); 43 - return EXIT_FAILURE; 44 - } 45 - 46 - if (load_bpf_file(argv[2])) 47 - return EXIT_FAILURE; 48 - 49 - printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf); 50 - 51 45 if (argc > 3) 52 46 filter_id = atoi(argv[3]); 53 47 48 + cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY); 49 + if (cg_fd < 0) { 50 + printf("Failed to open cgroup path: '%s'\n", strerror(errno)); 51 + return ret; 52 + } 53 + 54 + obj = bpf_object__open_file(argv[2], NULL); 55 + if (libbpf_get_error(obj)) { 56 + printf("ERROR: opening BPF object file failed\n"); 57 + return ret; 58 + } 59 + 60 + bpf_object__for_each_program(prog, obj) { 61 + progs[prog_cnt] = prog; 62 + prog_cnt++; 63 + } 64 + 54 65 if (filter_id >= prog_cnt) { 55 66 printf("Invalid program id; program not found in file\n"); 56 - return EXIT_FAILURE; 67 + goto cleanup; 57 68 } 58 69 59 - ret = bpf_prog_attach(prog_fd[filter_id], cg_fd, 60 - BPF_CGROUP_INET_SOCK_CREATE, 0); 61 - if (ret < 0) { 62 - printf("Failed to attach prog to cgroup: '%s'\n", 63 - strerror(errno)); 64 - return EXIT_FAILURE; 70 + /* load BPF program */ 71 + if (bpf_object__load(obj)) { 72 + printf("ERROR: loading BPF object file failed\n"); 73 + goto cleanup; 65 74 } 66 
75 67 - return EXIT_SUCCESS; 76 + link = bpf_program__attach_cgroup(progs[filter_id], cg_fd); 77 + if (libbpf_get_error(link)) { 78 + printf("ERROR: bpf_program__attach failed\n"); 79 + link = NULL; 80 + goto cleanup; 81 + } 82 + 83 + err = bpf_link__pin(link, link_pin_path); 84 + if (err < 0) { 85 + printf("ERROR: bpf_link__pin failed: %d\n", err); 86 + goto cleanup; 87 + } 88 + 89 + ret = EXIT_SUCCESS; 90 + 91 + cleanup: 92 + bpf_link__destroy(link); 93 + bpf_object__close(obj); 94 + return ret; 68 95 }

+17 -4

samples/bpf/test_cgrp2_sock2.sh

··· 1 1 #!/bin/bash 2 2 # SPDX-License-Identifier: GPL-2.0 3 3 4 + BPFFS=/sys/fs/bpf 5 + LINK_PIN=$BPFFS/test_cgrp2_sock2 6 + 4 7 function config_device { 5 8 ip netns add at_ns0 6 9 ip link add veth0 type veth peer name veth0b ··· 24 21 echo $$ >> /tmp/cgroupv2/foo/cgroup.procs 25 22 } 26 23 24 + function config_bpffs { 25 + if mount | grep $BPFFS > /dev/null; then 26 + echo "bpffs already mounted" 27 + else 28 + echo "bpffs not mounted. Mounting..." 29 + mount -t bpf none $BPFFS 30 + fi 31 + } 27 32 28 33 function attach_bpf { 29 - test_cgrp2_sock2 /tmp/cgroupv2/foo sock_flags_kern.o $1 34 + ./test_cgrp2_sock2 /tmp/cgroupv2/foo sock_flags_kern.o $1 30 35 [ $? -ne 0 ] && exit 1 31 36 } 32 37 33 38 function cleanup { 34 - if [ -d /tmp/cgroupv2/foo ]; then 35 - test_cgrp2_sock -d /tmp/cgroupv2/foo 36 - fi 39 + rm -rf $LINK_PIN 37 40 ip link del veth0b 38 41 ip netns delete at_ns0 39 42 umount /tmp/cgroupv2 ··· 51 42 set -e 52 43 config_device 53 44 config_cgroup 45 + config_bpffs 54 46 set +e 55 47 56 48 # ··· 71 61 cleanup 72 62 exit 1 73 63 fi 64 + 65 + rm -rf $LINK_PIN 66 + sleep 1 # Wait for link detach 74 67 75 68 # 76 69 # Test 2 - fail ping

-3

samples/bpf/test_lru_dist.c

··· 489 489 490 490 int main(int argc, char **argv) 491 491 { 492 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 493 492 int map_flags[] = {0, BPF_F_NO_COMMON_LRU}; 494 493 const char *dist_file; 495 494 int nr_tasks = 1; ··· 506 507 nr_tasks = atoi(argv[3]); 507 508 508 509 setbuf(stdout, NULL); 509 - 510 - assert(!setrlimit(RLIMIT_MEMLOCK, &r)); 511 510 512 511 srand(time(NULL)); 513 512

samples/bpf/test_lwt_bpf.sh

-6

samples/bpf/test_map_in_map_user.c

··· 114 114 115 115 int main(int argc, char **argv) 116 116 { 117 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 118 117 struct bpf_link *link = NULL; 119 118 struct bpf_program *prog; 120 119 struct bpf_object *obj; 121 120 char filename[256]; 122 - 123 - if (setrlimit(RLIMIT_MEMLOCK, &r)) { 124 - perror("setrlimit(RLIMIT_MEMLOCK)"); 125 - return 1; 126 - } 127 121 128 122 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 129 123 obj = bpf_object__open_file(filename, NULL);

+59 -25

samples/bpf/test_overhead_user.c

··· 18 18 #include <time.h> 19 19 #include <sys/resource.h> 20 20 #include <bpf/bpf.h> 21 - #include "bpf_load.h" 21 + #include <bpf/libbpf.h> 22 22 23 23 #define MAX_CNT 1000000 24 + 25 + static struct bpf_link *links[2]; 26 + static struct bpf_object *obj; 27 + static int cnt; 24 28 25 29 static __u64 time_get_ns(void) 26 30 { ··· 119 115 } 120 116 } 121 117 118 + static int load_progs(char *filename) 119 + { 120 + struct bpf_program *prog; 121 + int err = 0; 122 + 123 + obj = bpf_object__open_file(filename, NULL); 124 + err = libbpf_get_error(obj); 125 + if (err < 0) { 126 + fprintf(stderr, "ERROR: opening BPF object file failed\n"); 127 + return err; 128 + } 129 + 130 + /* load BPF program */ 131 + err = bpf_object__load(obj); 132 + if (err < 0) { 133 + fprintf(stderr, "ERROR: loading BPF object file failed\n"); 134 + return err; 135 + } 136 + 137 + bpf_object__for_each_program(prog, obj) { 138 + links[cnt] = bpf_program__attach(prog); 139 + err = libbpf_get_error(links[cnt]); 140 + if (err < 0) { 141 + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); 142 + links[cnt] = NULL; 143 + return err; 144 + } 145 + cnt++; 146 + } 147 + 148 + return err; 149 + } 150 + 122 151 static void unload_progs(void) 123 152 { 124 - close(prog_fd[0]); 125 - close(prog_fd[1]); 126 - close(event_fd[0]); 127 - close(event_fd[1]); 153 + while (cnt) 154 + bpf_link__destroy(links[--cnt]); 155 + 156 + bpf_object__close(obj); 128 157 } 129 158 130 159 int main(int argc, char **argv) 131 160 { 132 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 133 - char filename[256]; 134 - int num_cpu = 8; 161 + int num_cpu = sysconf(_SC_NPROCESSORS_ONLN); 135 162 int test_flags = ~0; 163 + char filename[256]; 164 + int err = 0; 136 165 137 - setrlimit(RLIMIT_MEMLOCK, &r); 138 166 139 167 if (argc > 1) 140 168 test_flags = atoi(argv[1]) ? 
: test_flags; ··· 181 145 if (test_flags & 0xC) { 182 146 snprintf(filename, sizeof(filename), 183 147 "%s_kprobe_kern.o", argv[0]); 184 - if (load_bpf_file(filename)) { 185 - printf("%s", bpf_log_buf); 186 - return 1; 187 - } 148 + 188 149 printf("w/KPROBE\n"); 189 - run_perf_test(num_cpu, test_flags >> 2); 150 + err = load_progs(filename); 151 + if (!err) 152 + run_perf_test(num_cpu, test_flags >> 2); 153 + 190 154 unload_progs(); 191 155 } 192 156 193 157 if (test_flags & 0x30) { 194 158 snprintf(filename, sizeof(filename), 195 159 "%s_tp_kern.o", argv[0]); 196 - if (load_bpf_file(filename)) { 197 - printf("%s", bpf_log_buf); 198 - return 1; 199 - } 200 160 printf("w/TRACEPOINT\n"); 201 - run_perf_test(num_cpu, test_flags >> 4); 161 + err = load_progs(filename); 162 + if (!err) 163 + run_perf_test(num_cpu, test_flags >> 4); 164 + 202 165 unload_progs(); 203 166 } 204 167 205 168 if (test_flags & 0xC0) { 206 169 snprintf(filename, sizeof(filename), 207 170 "%s_raw_tp_kern.o", argv[0]); 208 - if (load_bpf_file(filename)) { 209 - printf("%s", bpf_log_buf); 210 - return 1; 211 - } 212 171 printf("w/RAW_TRACEPOINT\n"); 213 - run_perf_test(num_cpu, test_flags >> 6); 172 + err = load_progs(filename); 173 + if (!err) 174 + run_perf_test(num_cpu, test_flags >> 6); 175 + 214 176 unload_progs(); 215 177 } 216 178 217 - return 0; 179 + return err; 218 180 }

-2

samples/bpf/trace_event_user.c

··· 294 294 295 295 int main(int argc, char **argv) 296 296 { 297 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 298 297 struct bpf_object *obj = NULL; 299 298 char filename[256]; 300 299 int error = 1; 301 300 302 301 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 303 - setrlimit(RLIMIT_MEMLOCK, &r); 304 302 305 303 signal(SIGINT, err_exit); 306 304 signal(SIGTERM, err_exit);

-6

samples/bpf/tracex2_user.c

··· 116 116 117 117 int main(int ac, char **argv) 118 118 { 119 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 120 119 long key, next_key, value; 121 120 struct bpf_link *links[2]; 122 121 struct bpf_program *prog; ··· 123 124 char filename[256]; 124 125 int i, j = 0; 125 126 FILE *f; 126 - 127 - if (setrlimit(RLIMIT_MEMLOCK, &r)) { 128 - perror("setrlimit(RLIMIT_MEMLOCK)"); 129 - return 1; 130 - } 131 127 132 128 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 133 129 obj = bpf_object__open_file(filename, NULL);

-6

samples/bpf/tracex3_user.c

··· 107 107 108 108 int main(int ac, char **argv) 109 109 { 110 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 111 110 struct bpf_link *links[2]; 112 111 struct bpf_program *prog; 113 112 struct bpf_object *obj; ··· 124 125 " -t text only\n"); 125 126 return 1; 126 127 } 127 - } 128 - 129 - if (setrlimit(RLIMIT_MEMLOCK, &r)) { 130 - perror("setrlimit(RLIMIT_MEMLOCK)"); 131 - return 1; 132 128 } 133 129 134 130 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);

-6

samples/bpf/tracex4_user.c

··· 48 48 49 49 int main(int ac, char **argv) 50 50 { 51 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 52 51 struct bpf_link *links[2]; 53 52 struct bpf_program *prog; 54 53 struct bpf_object *obj; 55 54 char filename[256]; 56 55 int map_fd, i, j = 0; 57 - 58 - if (setrlimit(RLIMIT_MEMLOCK, &r)) { 59 - perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)"); 60 - return 1; 61 - } 62 56 63 57 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 64 58 obj = bpf_object__open_file(filename, NULL);

-3

samples/bpf/tracex5_user.c

··· 34 34 35 35 int main(int ac, char **argv) 36 36 { 37 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 38 37 struct bpf_link *link = NULL; 39 38 struct bpf_program *prog; 40 39 struct bpf_object *obj; ··· 41 42 const char *section; 42 43 char filename[256]; 43 44 FILE *f; 44 - 45 - setrlimit(RLIMIT_MEMLOCK, &r); 46 45 47 46 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 48 47 obj = bpf_object__open_file(filename, NULL);

-3

samples/bpf/tracex6_user.c

··· 175 175 176 176 int main(int argc, char **argv) 177 177 { 178 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 179 178 struct bpf_link *links[2]; 180 179 struct bpf_program *prog; 181 180 struct bpf_object *obj; 182 181 char filename[256]; 183 182 int i = 0; 184 - 185 - setrlimit(RLIMIT_MEMLOCK, &r); 186 183 187 184 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 188 185 obj = bpf_object__open_file(filename, NULL);

-6

samples/bpf/xdp1_user.c

··· 79 79 80 80 int main(int argc, char **argv) 81 81 { 82 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 83 82 struct bpf_prog_load_attr prog_load_attr = { 84 83 .prog_type = BPF_PROG_TYPE_XDP, 85 84 }; ··· 113 114 114 115 if (optind == argc) { 115 116 usage(basename(argv[0])); 116 - return 1; 117 - } 118 - 119 - if (setrlimit(RLIMIT_MEMLOCK, &r)) { 120 - perror("setrlimit(RLIMIT_MEMLOCK)"); 121 117 return 1; 122 118 } 123 119

+1 -1

samples/bpf/xdp2skb_meta_kern.c

··· 6 6 * This uses the XDP data_meta infrastructure, and is a cooperation 7 7 * between two bpf-programs (1) XDP and (2) clsact at TC-ingress hook. 8 8 * 9 - * Notice: This example does not use the BPF C-loader (bpf_load.c), 9 + * Notice: This example does not use the BPF C-loader, 10 10 * but instead rely on the iproute2 TC tool for loading BPF-objects. 11 11 */ 12 12 #include <uapi/linux/bpf.h>

-6

samples/bpf/xdp_adjust_tail_user.c

··· 82 82 83 83 int main(int argc, char **argv) 84 84 { 85 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 86 85 struct bpf_prog_load_attr prog_load_attr = { 87 86 .prog_type = BPF_PROG_TYPE_XDP, 88 87 }; ··· 140 141 usage(argv[0]); 141 142 return 1; 142 143 } 143 - } 144 - 145 - if (setrlimit(RLIMIT_MEMLOCK, &r)) { 146 - perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)"); 147 - return 1; 148 144 } 149 145 150 146 if (!ifindex) {

-5

samples/bpf/xdp_monitor_user.c

··· 687 687 688 688 int main(int argc, char **argv) 689 689 { 690 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 691 690 struct bpf_program *prog; 692 691 int longindex = 0, opt; 693 692 int ret = EXIT_FAILURE; ··· 718 719 } 719 720 720 721 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 721 - if (setrlimit(RLIMIT_MEMLOCK, &r)) { 722 - perror("setrlimit(RLIMIT_MEMLOCK)"); 723 - return ret; 724 - } 725 722 726 723 /* Remove tracepoint program when program is interrupted or killed */ 727 724 signal(SIGINT, int_exit);

-6

samples/bpf/xdp_redirect_cpu_user.c

··· 765 765 766 766 int main(int argc, char **argv) 767 767 { 768 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 769 768 char *prog_name = "xdp_cpu_map5_lb_hash_ip_pairs"; 770 769 char *mprog_filename = "xdp_redirect_kern.o"; 771 770 char *redir_interface = NULL, *redir_map = NULL; ··· 802 803 803 804 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 804 805 prog_load_attr.file = filename; 805 - 806 - if (setrlimit(RLIMIT_MEMLOCK, &r)) { 807 - perror("setrlimit(RLIMIT_MEMLOCK)"); 808 - return 1; 809 - } 810 806 811 807 if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) 812 808 return err;

-6

samples/bpf/xdp_redirect_map_user.c

··· 96 96 97 97 int main(int argc, char **argv) 98 98 { 99 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 100 99 struct bpf_prog_load_attr prog_load_attr = { 101 100 .prog_type = BPF_PROG_TYPE_XDP, 102 101 }; ··· 131 132 132 133 if (optind == argc) { 133 134 printf("usage: %s <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n", argv[0]); 134 - return 1; 135 - } 136 - 137 - if (setrlimit(RLIMIT_MEMLOCK, &r)) { 138 - perror("setrlimit(RLIMIT_MEMLOCK)"); 139 135 return 1; 140 136 } 141 137

-6

samples/bpf/xdp_redirect_user.c

··· 97 97 98 98 int main(int argc, char **argv) 99 99 { 100 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 101 100 struct bpf_prog_load_attr prog_load_attr = { 102 101 .prog_type = BPF_PROG_TYPE_XDP, 103 102 }; ··· 132 133 133 134 if (optind == argc) { 134 135 printf("usage: %s <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n", argv[0]); 135 - return 1; 136 - } 137 - 138 - if (setrlimit(RLIMIT_MEMLOCK, &r)) { 139 - perror("setrlimit(RLIMIT_MEMLOCK)"); 140 136 return 1; 141 137 } 142 138

-6

samples/bpf/xdp_router_ipv4_user.c

··· 625 625 626 626 int main(int ac, char **argv) 627 627 { 628 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 629 628 struct bpf_prog_load_attr prog_load_attr = { 630 629 .prog_type = BPF_PROG_TYPE_XDP, 631 630 }; ··· 666 667 667 668 if (optind == ac) { 668 669 usage(basename(argv[0])); 669 - return 1; 670 - } 671 - 672 - if (setrlimit(RLIMIT_MEMLOCK, &r)) { 673 - perror("setrlimit(RLIMIT_MEMLOCK)"); 674 670 return 1; 675 671 } 676 672

-6

samples/bpf/xdp_rxq_info_user.c

··· 450 450 int main(int argc, char **argv) 451 451 { 452 452 __u32 cfg_options= NO_TOUCH ; /* Default: Don't touch packet memory */ 453 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 454 453 struct bpf_prog_load_attr prog_load_attr = { 455 454 .prog_type = BPF_PROG_TYPE_XDP, 456 455 }; ··· 472 473 473 474 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 474 475 prog_load_attr.file = filename; 475 - 476 - if (setrlimit(RLIMIT_MEMLOCK, &r)) { 477 - perror("setrlimit(RLIMIT_MEMLOCK)"); 478 - return 1; 479 - } 480 476 481 477 if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) 482 478 return EXIT_FAIL;

-6

samples/bpf/xdp_sample_pkts_user.c

··· 109 109 110 110 int main(int argc, char **argv) 111 111 { 112 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 113 112 struct bpf_prog_load_attr prog_load_attr = { 114 113 .prog_type = BPF_PROG_TYPE_XDP, 115 114 }; ··· 139 140 140 141 if (optind == argc) { 141 142 usage(basename(argv[0])); 142 - return 1; 143 - } 144 - 145 - if (setrlimit(RLIMIT_MEMLOCK, &r)) { 146 - perror("setrlimit(RLIMIT_MEMLOCK)"); 147 143 return 1; 148 144 } 149 145

-6

samples/bpf/xdp_tx_iptunnel_user.c

··· 155 155 struct bpf_prog_load_attr prog_load_attr = { 156 156 .prog_type = BPF_PROG_TYPE_XDP, 157 157 }; 158 - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 159 158 int min_port = 0, max_port = 0, vip2tnl_map_fd; 160 159 const char *optstr = "i:a:p:s:d:m:T:P:FSNh"; 161 160 unsigned char opt_flags[256] = {}; ··· 251 252 usage(argv[0]); 252 253 return 1; 253 254 } 254 - } 255 - 256 - if (setrlimit(RLIMIT_MEMLOCK, &r)) { 257 - perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)"); 258 - return 1; 259 255 } 260 256 261 257 if (!ifindex) {

+8

samples/bpf/xdpsock.h

··· 8 8 9 9 #define MAX_SOCKS 4 10 10 11 + #define SOCKET_NAME "sock_cal_bpf_fd" 12 + #define MAX_NUM_OF_CLIENTS 10 13 + 14 + #define CLOSE_CONN 1 15 + 16 + typedef __u64 u64; 17 + typedef __u32 u32; 18 + 11 19 #endif /* XDPSOCK_H */

+187

samples/bpf/xdpsock_ctrl_proc.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright(c) 2017 - 2018 Intel Corporation. */ 3 + 4 + #include <errno.h> 5 + #include <getopt.h> 6 + #include <libgen.h> 7 + #include <net/if.h> 8 + #include <stdio.h> 9 + #include <stdlib.h> 10 + #include <sys/socket.h> 11 + #include <sys/un.h> 12 + #include <unistd.h> 13 + 14 + #include <bpf/bpf.h> 15 + #include <bpf/xsk.h> 16 + #include "xdpsock.h" 17 + 18 + static const char *opt_if = ""; 19 + 20 + static struct option long_options[] = { 21 + {"interface", required_argument, 0, 'i'}, 22 + {0, 0, 0, 0} 23 + }; 24 + 25 + static void usage(const char *prog) 26 + { 27 + const char *str = 28 + " Usage: %s [OPTIONS]\n" 29 + " Options:\n" 30 + " -i, --interface=n Run on interface n\n" 31 + "\n"; 32 + fprintf(stderr, "%s\n", str); 33 + 34 + exit(0); 35 + } 36 + 37 + static void parse_command_line(int argc, char **argv) 38 + { 39 + int option_index, c; 40 + 41 + opterr = 0; 42 + 43 + for (;;) { 44 + c = getopt_long(argc, argv, "i:", 45 + long_options, &option_index); 46 + if (c == -1) 47 + break; 48 + 49 + switch (c) { 50 + case 'i': 51 + opt_if = optarg; 52 + break; 53 + default: 54 + usage(basename(argv[0])); 55 + } 56 + } 57 + } 58 + 59 + static int send_xsks_map_fd(int sock, int fd) 60 + { 61 + char cmsgbuf[CMSG_SPACE(sizeof(int))]; 62 + struct msghdr msg; 63 + struct iovec iov; 64 + int value = 0; 65 + 66 + if (fd == -1) { 67 + fprintf(stderr, "Incorrect fd = %d\n", fd); 68 + return -1; 69 + } 70 + iov.iov_base = &value; 71 + iov.iov_len = sizeof(int); 72 + 73 + msg.msg_name = NULL; 74 + msg.msg_namelen = 0; 75 + msg.msg_iov = &iov; 76 + msg.msg_iovlen = 1; 77 + msg.msg_flags = 0; 78 + msg.msg_control = cmsgbuf; 79 + msg.msg_controllen = CMSG_LEN(sizeof(int)); 80 + 81 + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); 82 + 83 + cmsg->cmsg_level = SOL_SOCKET; 84 + cmsg->cmsg_type = SCM_RIGHTS; 85 + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); 86 + 87 + *(int *)CMSG_DATA(cmsg) = fd; 88 + int ret = sendmsg(sock, &msg, 0); 89 + 
90 + if (ret == -1) { 91 + fprintf(stderr, "Sendmsg failed with %s", strerror(errno)); 92 + return -errno; 93 + } 94 + 95 + return ret; 96 + } 97 + 98 + int 99 + main(int argc, char **argv) 100 + { 101 + struct sockaddr_un server; 102 + int listening = 1; 103 + int rval, msgsock; 104 + int ifindex = 0; 105 + int flag = 1; 106 + int cmd = 0; 107 + int sock; 108 + int err; 109 + int xsks_map_fd; 110 + 111 + parse_command_line(argc, argv); 112 + 113 + ifindex = if_nametoindex(opt_if); 114 + if (ifindex == 0) { 115 + fprintf(stderr, "Unable to get ifindex for Interface %s. Reason:%s", 116 + opt_if, strerror(errno)); 117 + return -errno; 118 + } 119 + 120 + sock = socket(AF_UNIX, SOCK_STREAM, 0); 121 + if (sock < 0) { 122 + fprintf(stderr, "Opening socket stream failed: %s", strerror(errno)); 123 + return -errno; 124 + } 125 + 126 + server.sun_family = AF_UNIX; 127 + strcpy(server.sun_path, SOCKET_NAME); 128 + 129 + setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &flag, sizeof(int)); 130 + 131 + if (bind(sock, (struct sockaddr *)&server, sizeof(struct sockaddr_un))) { 132 + fprintf(stderr, "Binding to socket stream failed: %s", strerror(errno)); 133 + return -errno; 134 + } 135 + 136 + listen(sock, MAX_NUM_OF_CLIENTS); 137 + 138 + err = xsk_setup_xdp_prog(ifindex, &xsks_map_fd); 139 + if (err) { 140 + fprintf(stderr, "Setup of xdp program failed\n"); 141 + goto close_sock; 142 + } 143 + 144 + while (listening) { 145 + msgsock = accept(sock, 0, 0); 146 + if (msgsock == -1) { 147 + fprintf(stderr, "Error accepting connection: %s", strerror(errno)); 148 + err = -errno; 149 + goto close_sock; 150 + } 151 + err = send_xsks_map_fd(msgsock, xsks_map_fd); 152 + if (err <= 0) { 153 + fprintf(stderr, "Error %d sending xsks_map_fd\n", err); 154 + goto cleanup; 155 + } 156 + do { 157 + rval = read(msgsock, &cmd, sizeof(int)); 158 + if (rval < 0) { 159 + fprintf(stderr, "Error reading stream message"); 160 + } else { 161 + if (cmd != CLOSE_CONN) 162 + fprintf(stderr, "Recv unknown cmd = 
%d\n", cmd); 163 + listening = 0; 164 + break; 165 + } 166 + } while (rval > 0); 167 + } 168 + close(msgsock); 169 + close(sock); 170 + unlink(SOCKET_NAME); 171 + 172 + /* Unset fd for given ifindex */ 173 + err = bpf_set_link_xdp_fd(ifindex, -1, 0); 174 + if (err) { 175 + fprintf(stderr, "Error when unsetting bpf prog_fd for ifindex(%d)\n", ifindex); 176 + return err; 177 + } 178 + 179 + return 0; 180 + 181 + cleanup: 182 + close(msgsock); 183 + close_sock: 184 + close(sock); 185 + unlink(SOCKET_NAME); 186 + return err; 187 + }

+193 -37

samples/bpf/xdpsock_user.c

··· 24 24 #include <stdio.h> 25 25 #include <stdlib.h> 26 26 #include <string.h> 27 + #include <sys/capability.h> 27 28 #include <sys/mman.h> 28 29 #include <sys/resource.h> 29 30 #include <sys/socket.h> 30 31 #include <sys/types.h> 32 + #include <sys/un.h> 31 33 #include <time.h> 32 34 #include <unistd.h> 33 35 ··· 97 95 static bool opt_need_wakeup = true; 98 96 static u32 opt_num_xsks = 1; 99 97 static u32 prog_id; 98 + static bool opt_busy_poll; 99 + static bool opt_reduced_cap; 100 100 101 101 struct xsk_ring_stats { 102 102 unsigned long rx_npkts; ··· 157 153 158 154 static int num_socks; 159 155 struct xsk_socket_info *xsks[MAX_SOCKS]; 156 + int sock; 160 157 161 158 static unsigned long get_nsecs(void) 162 159 { ··· 465 460 static void remove_xdp_program(void) 466 461 { 467 462 u32 curr_prog_id = 0; 463 + int cmd = CLOSE_CONN; 468 464 469 465 if (bpf_get_link_xdp_id(opt_ifindex, &curr_prog_id, opt_xdp_flags)) { 470 466 printf("bpf_get_link_xdp_id failed\n"); ··· 477 471 printf("couldn't find a prog id on a given interface\n"); 478 472 else 479 473 printf("program on interface changed, not removing\n"); 474 + 475 + if (opt_reduced_cap) { 476 + if (write(sock, &cmd, sizeof(int)) < 0) { 477 + fprintf(stderr, "Error writing into stream socket: %s", strerror(errno)); 478 + exit(EXIT_FAILURE); 479 + } 480 + } 480 481 } 481 482 482 483 static void int_exit(int sig) ··· 866 853 xsk->umem = umem; 867 854 cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; 868 855 cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; 869 - if (opt_num_xsks > 1) 856 + if (opt_num_xsks > 1 || opt_reduced_cap) 870 857 cfg.libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD; 871 858 else 872 859 cfg.libbpf_flags = 0; ··· 924 911 {"quiet", no_argument, 0, 'Q'}, 925 912 {"app-stats", no_argument, 0, 'a'}, 926 913 {"irq-string", no_argument, 0, 'I'}, 914 + {"busy-poll", no_argument, 0, 'B'}, 915 + {"reduce-cap", no_argument, 0, 'R'}, 927 916 {0, 0, 0, 0} 928 917 }; 929 918 ··· 948 933 " -m, 
--no-need-wakeup Turn off use of driver need wakeup flag.\n" 949 934 " -f, --frame-size=n Set the frame size (must be a power of two in aligned mode, default is %d).\n" 950 935 " -u, --unaligned Enable unaligned chunk placement\n" 951 - " -M, --shared-umem Enable XDP_SHARED_UMEM\n" 936 + " -M, --shared-umem Enable XDP_SHARED_UMEM (cannot be used with -R)\n" 952 937 " -F, --force Force loading the XDP prog\n" 953 938 " -d, --duration=n Duration in secs to run command.\n" 954 939 " Default: forever.\n" ··· 964 949 " -Q, --quiet Do not display any stats.\n" 965 950 " -a, --app-stats Display application (syscall) statistics.\n" 966 951 " -I, --irq-string Display driver interrupt statistics for interface associated with irq-string.\n" 952 + " -B, --busy-poll Busy poll.\n" 953 + " -R, --reduce-cap Use reduced capabilities (cannot be used with -M)\n" 967 954 "\n"; 968 955 fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, 969 956 opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE, ··· 981 964 opterr = 0; 982 965 983 966 for (;;) { 984 - c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQaI:", 967 + c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQaI:BR", 985 968 long_options, &option_index); 986 969 if (c == -1) 987 970 break; ··· 1079 1062 fprintf(stderr, "ERROR: Failed to get irqs for %s\n", opt_irq_str); 1080 1063 usage(basename(argv[0])); 1081 1064 } 1082 - 1065 + break; 1066 + case 'B': 1067 + opt_busy_poll = 1; 1068 + break; 1069 + case 'R': 1070 + opt_reduced_cap = true; 1083 1071 break; 1084 1072 default: 1085 1073 usage(basename(argv[0])); ··· 1107 1085 opt_xsk_frame_size); 1108 1086 usage(basename(argv[0])); 1109 1087 } 1088 + 1089 + if (opt_reduced_cap && opt_num_xsks > 1) { 1090 + fprintf(stderr, "ERROR: -M and -R cannot be used together\n"); 1091 + usage(basename(argv[0])); 1092 + } 1110 1093 } 1111 1094 1112 1095 static void kick_tx(struct xsk_socket_info *xsk) ··· 1125 1098 exit_with_error(errno); 1126 1099 } 1127 1100 1128 - static 
inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, 1129 - struct pollfd *fds) 1101 + static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk) 1130 1102 { 1131 1103 struct xsk_umem_info *umem = xsk->umem; 1132 1104 u32 idx_cq = 0, idx_fq = 0; ··· 1158 1132 while (ret != rcvd) { 1159 1133 if (ret < 0) 1160 1134 exit_with_error(-ret); 1161 - if (xsk_ring_prod__needs_wakeup(&umem->fq)) { 1135 + if (opt_busy_poll || xsk_ring_prod__needs_wakeup(&umem->fq)) { 1162 1136 xsk->app_stats.fill_fail_polls++; 1163 - ret = poll(fds, num_socks, opt_timeout); 1137 + recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 1138 + NULL); 1164 1139 } 1165 1140 ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); 1166 1141 } ··· 1173 1146 xsk_ring_prod__submit(&xsk->umem->fq, rcvd); 1174 1147 xsk_ring_cons__release(&xsk->umem->cq, rcvd); 1175 1148 xsk->outstanding_tx -= rcvd; 1176 - xsk->ring_stats.tx_npkts += rcvd; 1177 1149 } 1178 1150 } 1179 1151 ··· 1194 1168 if (rcvd > 0) { 1195 1169 xsk_ring_cons__release(&xsk->umem->cq, rcvd); 1196 1170 xsk->outstanding_tx -= rcvd; 1197 - xsk->ring_stats.tx_npkts += rcvd; 1198 1171 } 1199 1172 } 1200 1173 1201 - static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds) 1174 + static void rx_drop(struct xsk_socket_info *xsk) 1202 1175 { 1203 1176 unsigned int rcvd, i; 1204 1177 u32 idx_rx = 0, idx_fq = 0; ··· 1205 1180 1206 1181 rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); 1207 1182 if (!rcvd) { 1208 - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { 1183 + if (opt_busy_poll || xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { 1209 1184 xsk->app_stats.rx_empty_polls++; 1210 - ret = poll(fds, num_socks, opt_timeout); 1185 + recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL); 1211 1186 } 1212 1187 return; 1213 1188 } ··· 1216 1191 while (ret != rcvd) { 1217 1192 if (ret < 0) 1218 1193 exit_with_error(-ret); 1219 - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { 1194 + if 
(opt_busy_poll || xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { 1220 1195 xsk->app_stats.fill_fail_polls++; 1221 - ret = poll(fds, num_socks, opt_timeout); 1196 + recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL); 1222 1197 } 1223 1198 ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); 1224 1199 } ··· 1260 1235 } 1261 1236 1262 1237 for (i = 0; i < num_socks; i++) 1263 - rx_drop(xsks[i], fds); 1238 + rx_drop(xsks[i]); 1264 1239 1265 1240 if (benchmark_done) 1266 1241 break; ··· 1285 1260 } 1286 1261 1287 1262 xsk_ring_prod__submit(&xsk->tx, batch_size); 1263 + xsk->ring_stats.tx_npkts += batch_size; 1288 1264 xsk->outstanding_tx += batch_size; 1289 1265 *frame_nb += batch_size; 1290 1266 *frame_nb %= NUM_FRAMES; ··· 1358 1332 complete_tx_only_all(); 1359 1333 } 1360 1334 1361 - static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) 1335 + static void l2fwd(struct xsk_socket_info *xsk) 1362 1336 { 1363 1337 unsigned int rcvd, i; 1364 1338 u32 idx_rx = 0, idx_tx = 0; 1365 1339 int ret; 1366 1340 1367 - complete_tx_l2fwd(xsk, fds); 1341 + complete_tx_l2fwd(xsk); 1368 1342 1369 1343 rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); 1370 1344 if (!rcvd) { 1371 - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { 1345 + if (opt_busy_poll || xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { 1372 1346 xsk->app_stats.rx_empty_polls++; 1373 - ret = poll(fds, num_socks, opt_timeout); 1347 + recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL); 1374 1348 } 1375 1349 return; 1376 1350 } 1351 + xsk->ring_stats.rx_npkts += rcvd; 1377 1352 1378 1353 ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); 1379 1354 while (ret != rcvd) { 1380 1355 if (ret < 0) 1381 1356 exit_with_error(-ret); 1382 - complete_tx_l2fwd(xsk, fds); 1383 - if (xsk_ring_prod__needs_wakeup(&xsk->tx)) { 1357 + complete_tx_l2fwd(xsk); 1358 + if (opt_busy_poll || xsk_ring_prod__needs_wakeup(&xsk->tx)) { 1384 1359 
xsk->app_stats.tx_wakeup_sendtos++; 1385 1360 kick_tx(xsk); 1386 1361 } ··· 1406 1379 xsk_ring_prod__submit(&xsk->tx, rcvd); 1407 1380 xsk_ring_cons__release(&xsk->rx, rcvd); 1408 1381 1409 - xsk->ring_stats.rx_npkts += rcvd; 1382 + xsk->ring_stats.tx_npkts += rcvd; 1410 1383 xsk->outstanding_tx += rcvd; 1411 1384 } 1412 1385 ··· 1415 1388 struct pollfd fds[MAX_SOCKS] = {}; 1416 1389 int i, ret; 1417 1390 1418 - for (i = 0; i < num_socks; i++) { 1419 - fds[i].fd = xsk_socket__fd(xsks[i]->xsk); 1420 - fds[i].events = POLLOUT | POLLIN; 1421 - } 1422 - 1423 1391 for (;;) { 1424 1392 if (opt_poll) { 1425 - for (i = 0; i < num_socks; i++) 1393 + for (i = 0; i < num_socks; i++) { 1394 + fds[i].fd = xsk_socket__fd(xsks[i]->xsk); 1395 + fds[i].events = POLLOUT | POLLIN; 1426 1396 xsks[i]->app_stats.opt_polls++; 1397 + } 1427 1398 ret = poll(fds, num_socks, opt_timeout); 1428 1399 if (ret <= 0) 1429 1400 continue; 1430 1401 } 1431 1402 1432 1403 for (i = 0; i < num_socks; i++) 1433 - l2fwd(xsks[i], fds); 1404 + l2fwd(xsks[i]); 1434 1405 1435 1406 if (benchmark_done) 1436 1407 break; ··· 1486 1461 } 1487 1462 } 1488 1463 1464 + static void apply_setsockopt(struct xsk_socket_info *xsk) 1465 + { 1466 + int sock_opt; 1467 + 1468 + if (!opt_busy_poll) 1469 + return; 1470 + 1471 + sock_opt = 1; 1472 + if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_PREFER_BUSY_POLL, 1473 + (void *)&sock_opt, sizeof(sock_opt)) < 0) 1474 + exit_with_error(errno); 1475 + 1476 + sock_opt = 20; 1477 + if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL, 1478 + (void *)&sock_opt, sizeof(sock_opt)) < 0) 1479 + exit_with_error(errno); 1480 + 1481 + sock_opt = opt_batch_size; 1482 + if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL_BUDGET, 1483 + (void *)&sock_opt, sizeof(sock_opt)) < 0) 1484 + exit_with_error(errno); 1485 + } 1486 + 1487 + static int recv_xsks_map_fd_from_ctrl_node(int sock, int *_fd) 1488 + { 1489 + char cms[CMSG_SPACE(sizeof(int))]; 1490 + struct 
cmsghdr *cmsg; 1491 + struct msghdr msg; 1492 + struct iovec iov; 1493 + int value; 1494 + int len; 1495 + 1496 + iov.iov_base = &value; 1497 + iov.iov_len = sizeof(int); 1498 + 1499 + msg.msg_name = 0; 1500 + msg.msg_namelen = 0; 1501 + msg.msg_iov = &iov; 1502 + msg.msg_iovlen = 1; 1503 + msg.msg_flags = 0; 1504 + msg.msg_control = (caddr_t)cms; 1505 + msg.msg_controllen = sizeof(cms); 1506 + 1507 + len = recvmsg(sock, &msg, 0); 1508 + 1509 + if (len < 0) { 1510 + fprintf(stderr, "Recvmsg failed length incorrect.\n"); 1511 + return -EINVAL; 1512 + } 1513 + 1514 + if (len == 0) { 1515 + fprintf(stderr, "Recvmsg failed no data\n"); 1516 + return -EINVAL; 1517 + } 1518 + 1519 + cmsg = CMSG_FIRSTHDR(&msg); 1520 + *_fd = *(int *)CMSG_DATA(cmsg); 1521 + 1522 + return 0; 1523 + } 1524 + 1525 + static int 1526 + recv_xsks_map_fd(int *xsks_map_fd) 1527 + { 1528 + struct sockaddr_un server; 1529 + int err; 1530 + 1531 + sock = socket(AF_UNIX, SOCK_STREAM, 0); 1532 + if (sock < 0) { 1533 + fprintf(stderr, "Error opening socket stream: %s", strerror(errno)); 1534 + return errno; 1535 + } 1536 + 1537 + server.sun_family = AF_UNIX; 1538 + strcpy(server.sun_path, SOCKET_NAME); 1539 + 1540 + if (connect(sock, (struct sockaddr *)&server, sizeof(struct sockaddr_un)) < 0) { 1541 + close(sock); 1542 + fprintf(stderr, "Error connecting stream socket: %s", strerror(errno)); 1543 + return errno; 1544 + } 1545 + 1546 + err = recv_xsks_map_fd_from_ctrl_node(sock, xsks_map_fd); 1547 + if (err) { 1548 + fprintf(stderr, "Error %d receiving fd\n", err); 1549 + return err; 1550 + } 1551 + return 0; 1552 + } 1553 + 1489 1554 int main(int argc, char **argv) 1490 1555 { 1556 + struct __user_cap_header_struct hdr = { _LINUX_CAPABILITY_VERSION_3, 0 }; 1557 + struct __user_cap_data_struct data[2] = { { 0 } }; 1491 1558 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 1492 1559 bool rx = false, tx = false; 1493 1560 struct xsk_umem_info *umem; 1494 1561 struct bpf_object *obj; 1562 + int 
xsks_map_fd = 0; 1495 1563 pthread_t pt; 1496 1564 int i, ret; 1497 1565 void *bufs; 1498 1566 1499 1567 parse_command_line(argc, argv); 1500 1568 1501 - if (setrlimit(RLIMIT_MEMLOCK, &r)) { 1502 - fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n", 1503 - strerror(errno)); 1504 - exit(EXIT_FAILURE); 1505 - } 1569 + if (opt_reduced_cap) { 1570 + if (capget(&hdr, data) < 0) 1571 + fprintf(stderr, "Error getting capabilities\n"); 1506 1572 1507 - if (opt_num_xsks > 1) 1508 - load_xdp_program(argv, &obj); 1573 + data->effective &= CAP_TO_MASK(CAP_NET_RAW); 1574 + data->permitted &= CAP_TO_MASK(CAP_NET_RAW); 1575 + 1576 + if (capset(&hdr, data) < 0) 1577 + fprintf(stderr, "Setting capabilities failed\n"); 1578 + 1579 + if (capget(&hdr, data) < 0) { 1580 + fprintf(stderr, "Error getting capabilities\n"); 1581 + } else { 1582 + fprintf(stderr, "Capabilities EFF %x Caps INH %x Caps Per %x\n", 1583 + data[0].effective, data[0].inheritable, data[0].permitted); 1584 + fprintf(stderr, "Capabilities EFF %x Caps INH %x Caps Per %x\n", 1585 + data[1].effective, data[1].inheritable, data[1].permitted); 1586 + } 1587 + } else { 1588 + if (setrlimit(RLIMIT_MEMLOCK, &r)) { 1589 + fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n", 1590 + strerror(errno)); 1591 + exit(EXIT_FAILURE); 1592 + } 1593 + 1594 + if (opt_num_xsks > 1) 1595 + load_xdp_program(argv, &obj); 1596 + } 1509 1597 1510 1598 /* Reserve memory for the umem. 
Use hugepages if unaligned chunk mode */ 1511 1599 bufs = mmap(NULL, NUM_FRAMES * opt_xsk_frame_size, ··· 1640 1502 for (i = 0; i < opt_num_xsks; i++) 1641 1503 xsks[num_socks++] = xsk_configure_socket(umem, rx, tx); 1642 1504 1505 + for (i = 0; i < opt_num_xsks; i++) 1506 + apply_setsockopt(xsks[i]); 1507 + 1643 1508 if (opt_bench == BENCH_TXONLY) { 1644 1509 gen_eth_hdr_data(); 1645 1510 ··· 1652 1511 1653 1512 if (opt_num_xsks > 1 && opt_bench != BENCH_TXONLY) 1654 1513 enter_xsks_into_map(obj); 1514 + 1515 + if (opt_reduced_cap) { 1516 + ret = recv_xsks_map_fd(&xsks_map_fd); 1517 + if (ret) { 1518 + fprintf(stderr, "Error %d receiving xsks_map_fd\n", ret); 1519 + exit_with_error(ret); 1520 + } 1521 + if (xsks[0]->xsk) { 1522 + ret = xsk_socket__update_xskmap(xsks[0]->xsk, xsks_map_fd); 1523 + if (ret) { 1524 + fprintf(stderr, "Update of BPF map failed(%d)\n", ret); 1525 + exit_with_error(ret); 1526 + } 1527 + } 1528 + } 1655 1529 1656 1530 signal(SIGINT, int_exit); 1657 1531 signal(SIGTERM, int_exit);

+7 -2

scripts/Makefile.modfinal

··· 38 38 $(if $(ARCH_POSTLINK), $(MAKE) -f $(ARCH_POSTLINK) $@, true) 39 39 40 40 quiet_cmd_btf_ko = BTF [M] $@ 41 - cmd_btf_ko = LLVM_OBJCOPY=$(OBJCOPY) $(PAHOLE) -J --btf_base vmlinux $@ 41 + cmd_btf_ko = \ 42 + if [ -f vmlinux ]; then \ 43 + LLVM_OBJCOPY=$(OBJCOPY) $(PAHOLE) -J --btf_base vmlinux $@; \ 44 + else \ 45 + printf "Skipping BTF generation for %s due to unavailability of vmlinux\n" $@ 1>&2; \ 46 + fi; 42 47 43 48 # Same as newer-prereqs, but allows to exclude specified extra dependencies 44 49 newer_prereqs_except = $(filter-out $(PHONY) $(1),$?) ··· 54 49 printf '%s\n' 'cmd_$@ := $(make-cmd)' > $(dot-target).cmd, @:) 55 50 56 51 # Re-generate module BTFs if either module's .ko or vmlinux changed 57 - $(modules): %.ko: %.o %.mod.o scripts/module.lds vmlinux FORCE 52 + $(modules): %.ko: %.o %.mod.o scripts/module.lds $(if $(KBUILD_BUILTIN),vmlinux) FORCE 58 53 +$(call if_changed_except,ld_ko_o,vmlinux) 59 54 ifdef CONFIG_DEBUG_INFO_BTF_MODULES 60 55 +$(if $(newer-prereqs),$(call cmd,btf_ko))

+4

scripts/bpf_helpers_doc.py

··· 418 418 'struct bpf_tcp_sock', 419 419 'struct bpf_tunnel_key', 420 420 'struct bpf_xfrm_state', 421 + 'struct linux_binprm', 421 422 'struct pt_regs', 422 423 'struct sk_reuseport_md', 423 424 'struct sockaddr', ··· 436 435 'struct xdp_md', 437 436 'struct path', 438 437 'struct btf_ptr', 438 + 'struct inode', 439 439 ] 440 440 known_types = { 441 441 '...', ··· 467 465 'struct bpf_tcp_sock', 468 466 'struct bpf_tunnel_key', 469 467 'struct bpf_xfrm_state', 468 + 'struct linux_binprm', 470 469 'struct pt_regs', 471 470 'struct sk_reuseport_md', 472 471 'struct sockaddr', ··· 481 478 'struct task_struct', 482 479 'struct path', 483 480 'struct btf_ptr', 481 + 'struct inode', 484 482 } 485 483 mapped_types = { 486 484 'u8': '__u8',

+54 -24

security/integrity/ima/ima_main.c

··· 501 501 } 502 502 EXPORT_SYMBOL_GPL(ima_file_check); 503 503 504 - /** 505 - * ima_file_hash - return the stored measurement if a file has been hashed and 506 - * is in the iint cache. 507 - * @file: pointer to the file 508 - * @buf: buffer in which to store the hash 509 - * @buf_size: length of the buffer 510 - * 511 - * On success, return the hash algorithm (as defined in the enum hash_algo). 512 - * If buf is not NULL, this function also outputs the hash into buf. 513 - * If the hash is larger than buf_size, then only buf_size bytes will be copied. 514 - * It generally just makes sense to pass a buffer capable of holding the largest 515 - * possible hash: IMA_MAX_DIGEST_SIZE. 516 - * The file hash returned is based on the entire file, including the appended 517 - * signature. 518 - * 519 - * If IMA is disabled or if no measurement is available, return -EOPNOTSUPP. 520 - * If the parameters are incorrect, return -EINVAL. 521 - */ 522 - int ima_file_hash(struct file *file, char *buf, size_t buf_size) 504 + static int __ima_inode_hash(struct inode *inode, char *buf, size_t buf_size) 523 505 { 524 - struct inode *inode; 525 506 struct integrity_iint_cache *iint; 526 507 int hash_algo; 527 - 528 - if (!file) 529 - return -EINVAL; 530 508 531 509 if (!ima_policy_flag) 532 510 return -EOPNOTSUPP; 533 511 534 - inode = file_inode(file); 535 512 iint = integrity_iint_find(inode); 536 513 if (!iint) 537 514 return -EOPNOTSUPP; ··· 535 558 536 559 return hash_algo; 537 560 } 561 + 562 + /** 563 + * ima_file_hash - return the stored measurement if a file has been hashed and 564 + * is in the iint cache. 565 + * @file: pointer to the file 566 + * @buf: buffer in which to store the hash 567 + * @buf_size: length of the buffer 568 + * 569 + * On success, return the hash algorithm (as defined in the enum hash_algo). 570 + * If buf is not NULL, this function also outputs the hash into buf. 571 + * If the hash is larger than buf_size, then only buf_size bytes will be copied. 
572 + * It generally just makes sense to pass a buffer capable of holding the largest 573 + * possible hash: IMA_MAX_DIGEST_SIZE. 574 + * The file hash returned is based on the entire file, including the appended 575 + * signature. 576 + * 577 + * If IMA is disabled or if no measurement is available, return -EOPNOTSUPP. 578 + * If the parameters are incorrect, return -EINVAL. 579 + */ 580 + int ima_file_hash(struct file *file, char *buf, size_t buf_size) 581 + { 582 + if (!file) 583 + return -EINVAL; 584 + 585 + return __ima_inode_hash(file_inode(file), buf, buf_size); 586 + } 538 587 EXPORT_SYMBOL_GPL(ima_file_hash); 588 + 589 + /** 590 + * ima_inode_hash - return the stored measurement if the inode has been hashed 591 + * and is in the iint cache. 592 + * @inode: pointer to the inode 593 + * @buf: buffer in which to store the hash 594 + * @buf_size: length of the buffer 595 + * 596 + * On success, return the hash algorithm (as defined in the enum hash_algo). 597 + * If buf is not NULL, this function also outputs the hash into buf. 598 + * If the hash is larger than buf_size, then only buf_size bytes will be copied. 599 + * It generally just makes sense to pass a buffer capable of holding the largest 600 + * possible hash: IMA_MAX_DIGEST_SIZE. 601 + * The hash returned is based on the entire contents, including the appended 602 + * signature. 603 + * 604 + * If IMA is disabled or if no measurement is available, return -EOPNOTSUPP. 605 + * If the parameters are incorrect, return -EINVAL. 606 + */ 607 + int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size) 608 + { 609 + if (!inode) 610 + return -EINVAL; 611 + 612 + return __ima_inode_hash(inode, buf, buf_size); 613 + } 614 + EXPORT_SYMBOL_GPL(ima_inode_hash); 539 615 540 616 /** 541 617 * ima_post_create_tmpfile - mark newly created tmpfile as new

+23 -4

tools/bpf/bpftool/btf.c

··· 357 357 dump_btf_type(btf, root_type_ids[i], t); 358 358 } 359 359 } else { 360 + const struct btf *base; 360 361 int cnt = btf__get_nr_types(btf); 361 362 int start_id = 1; 362 363 363 - if (base_btf) 364 - start_id = btf__get_nr_types(base_btf) + 1; 364 + base = btf__base_btf(btf); 365 + if (base) 366 + start_id = btf__get_nr_types(base) + 1; 365 367 366 368 for (i = start_id; i <= cnt; i++) { 367 369 t = btf__type_by_id(btf, i); ··· 430 428 431 429 static int do_dump(int argc, char **argv) 432 430 { 433 - struct btf *btf = NULL; 431 + struct btf *btf = NULL, *base = NULL; 434 432 __u32 root_type_ids[2]; 435 433 int root_type_cnt = 0; 436 434 bool dump_c = false; ··· 504 502 } 505 503 NEXT_ARG(); 506 504 } else if (is_prefix(src, "file")) { 507 - btf = btf__parse_split(*argv, base_btf); 505 + const char sysfs_prefix[] = "/sys/kernel/btf/"; 506 + const char sysfs_vmlinux[] = "/sys/kernel/btf/vmlinux"; 507 + 508 + if (!base_btf && 509 + strncmp(*argv, sysfs_prefix, sizeof(sysfs_prefix) - 1) == 0 && 510 + strcmp(*argv, sysfs_vmlinux) != 0) { 511 + base = btf__parse(sysfs_vmlinux, NULL); 512 + if (libbpf_get_error(base)) { 513 + p_err("failed to parse vmlinux BTF at '%s': %ld\n", 514 + sysfs_vmlinux, libbpf_get_error(base)); 515 + base = NULL; 516 + } 517 + } 518 + 519 + btf = btf__parse_split(*argv, base ?: base_btf); 508 520 if (IS_ERR(btf)) { 509 521 err = -PTR_ERR(btf); 510 522 btf = NULL; ··· 583 567 done: 584 568 close(fd); 585 569 btf__free(btf); 570 + btf__free(base); 586 571 return err; 587 572 } 588 573 ··· 768 751 printf("name [%s] ", name); 769 752 else if (name && name[0]) 770 753 printf("name %s ", name); 754 + else 755 + printf("name <anon> "); 771 756 printf("size %uB", info->btf_size); 772 757 773 758 n = 0;

+29 -1

tools/bpf/bpftool/prog.c

··· 1717 1717 .ratio_desc = "LLC misses per million insns", 1718 1718 .ratio_mul = 1e6, 1719 1719 }, 1720 + { 1721 + .name = "itlb_misses", 1722 + .attr = { 1723 + .type = PERF_TYPE_HW_CACHE, 1724 + .config = 1725 + PERF_COUNT_HW_CACHE_ITLB | 1726 + (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1727 + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), 1728 + .exclude_user = 1 1729 + }, 1730 + .ratio_metric = 2, 1731 + .ratio_desc = "itlb misses per million insns", 1732 + .ratio_mul = 1e6, 1733 + }, 1734 + { 1735 + .name = "dtlb_misses", 1736 + .attr = { 1737 + .type = PERF_TYPE_HW_CACHE, 1738 + .config = 1739 + PERF_COUNT_HW_CACHE_DTLB | 1740 + (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1741 + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), 1742 + .exclude_user = 1 1743 + }, 1744 + .ratio_metric = 2, 1745 + .ratio_desc = "dtlb misses per million insns", 1746 + .ratio_mul = 1e6, 1747 + }, 1720 1748 }; 1721 1749 1722 1750 static __u64 profile_total_count; ··· 2137 2109 " struct_ops | fentry | fexit | freplace | sk_lookup }\n" 2138 2110 " ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n" 2139 2111 " flow_dissector }\n" 2140 - " METRIC := { cycles | instructions | l1d_loads | llc_misses }\n" 2112 + " METRIC := { cycles | instructions | l1d_loads | llc_misses | itlb_misses | dtlb_misses }\n" 2141 2113 " " HELP_SPEC_OPTIONS "\n" 2142 2114 "", 2143 2115 bin_name, argv[-2]);

+3 -3

tools/bpf/resolve_btfids/main.c

··· 454 454 return -ENOMEM; 455 455 456 456 if (id->addr_cnt >= ADDR_CNT) { 457 - pr_err("FAILED symbol %s crossed the number of allowed lists", 457 + pr_err("FAILED symbol %s crossed the number of allowed lists\n", 458 458 id->name); 459 459 return -1; 460 460 } ··· 477 477 btf = btf__parse(obj->btf ?: obj->path, NULL); 478 478 err = libbpf_get_error(btf); 479 479 if (err) { 480 - pr_err("FAILED: load BTF from %s: %s", 481 - obj->path, strerror(err)); 480 + pr_err("FAILED: load BTF from %s: %s\n", 481 + obj->path, strerror(-err)); 482 482 return -1; 483 483 } 484 484

+44 -1

tools/include/uapi/linux/bpf.h

··· 557 557 __aligned_u64 line_info; /* line info */ 558 558 __u32 line_info_cnt; /* number of bpf_line_info records */ 559 559 __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ 560 - __u32 attach_prog_fd; /* 0 to attach to vmlinux */ 560 + union { 561 + /* valid prog_fd to attach to bpf prog */ 562 + __u32 attach_prog_fd; 563 + /* or valid module BTF object fd or 0 to attach to vmlinux */ 564 + __u32 attach_btf_obj_fd; 565 + }; 561 566 }; 562 567 563 568 struct { /* anonymous struct used by BPF_OBJ_* commands */ ··· 3792 3787 * *ARG_PTR_TO_BTF_ID* of type *task_struct*. 3793 3788 * Return 3794 3789 * Pointer to the current task. 3790 + * 3791 + * long bpf_bprm_opts_set(struct linux_binprm *bprm, u64 flags) 3792 + * Description 3793 + * Set or clear certain options on *bprm*: 3794 + * 3795 + * **BPF_F_BPRM_SECUREEXEC** Set the secureexec bit 3796 + * which sets the **AT_SECURE** auxv for glibc. The bit 3797 + * is cleared if the flag is not specified. 3798 + * Return 3799 + * **-EINVAL** if invalid *flags* are passed, zero otherwise. 3800 + * 3801 + * u64 bpf_ktime_get_coarse_ns(void) 3802 + * Description 3803 + * Return a coarse-grained version of the time elapsed since 3804 + * system boot, in nanoseconds. Does not include time the system 3805 + * was suspended. 3806 + * 3807 + * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**) 3808 + * Return 3809 + * Current *ktime*. 3810 + * 3811 + * long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size) 3812 + * Description 3813 + * Returns the stored IMA hash of the *inode* (if it's available). 3814 + * If the hash is larger than *size*, then only *size* 3815 + * bytes will be copied to *dst* 3816 + * Return 3817 + * The **hash_algo** is returned on success, 3818 + * **-EOPNOTSUPP** if IMA is disabled or **-EINVAL** if 3819 + * invalid arguments are passed. 
3795 3820 */ 3796 3821 #define __BPF_FUNC_MAPPER(FN) \ 3797 3822 FN(unspec), \ ··· 3983 3948 FN(task_storage_get), \ 3984 3949 FN(task_storage_delete), \ 3985 3950 FN(get_current_task_btf), \ 3951 + FN(bprm_opts_set), \ 3952 + FN(ktime_get_coarse_ns), \ 3953 + FN(ima_inode_hash), \ 3986 3954 /* */ 3987 3955 3988 3956 /* integer value in 'imm' field of BPF_CALL instruction selects which helper ··· 4155 4117 BPF_LWT_ENCAP_SEG6, 4156 4118 BPF_LWT_ENCAP_SEG6_INLINE, 4157 4119 BPF_LWT_ENCAP_IP, 4120 + }; 4121 + 4122 + /* Flags for bpf_bprm_opts_set helper */ 4123 + enum { 4124 + BPF_F_BPRM_SECUREEXEC = (1ULL << 0), 4158 4125 }; 4159 4126 4160 4127 #define __bpf_md_ptr(type, name) \

+72 -32

tools/lib/bpf/bpf.c

··· 67 67 68 68 static inline int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size) 69 69 { 70 + int retries = 5; 70 71 int fd; 71 72 72 73 do { 73 74 fd = sys_bpf(BPF_PROG_LOAD, attr, size); 74 - } while (fd < 0 && errno == EAGAIN); 75 + } while (fd < 0 && errno == EAGAIN && retries-- > 0); 75 76 76 77 return fd; 77 78 } ··· 215 214 return info; 216 215 } 217 216 218 - int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, 219 - char *log_buf, size_t log_buf_sz) 217 + int libbpf__bpf_prog_load(const struct bpf_prog_load_params *load_attr) 220 218 { 221 219 void *finfo = NULL, *linfo = NULL; 222 220 union bpf_attr attr; 223 - __u32 log_level; 224 221 int fd; 225 222 226 - if (!load_attr || !log_buf != !log_buf_sz) 223 + if (!load_attr->log_buf != !load_attr->log_buf_sz) 227 224 return -EINVAL; 228 225 229 - log_level = load_attr->log_level; 230 - if (log_level > (4 | 2 | 1) || (log_level && !log_buf)) 226 + if (load_attr->log_level > (4 | 2 | 1) || (load_attr->log_level && !load_attr->log_buf)) 231 227 return -EINVAL; 232 228 233 229 memset(&attr, 0, sizeof(attr)); 234 230 attr.prog_type = load_attr->prog_type; 235 231 attr.expected_attach_type = load_attr->expected_attach_type; 236 - if (attr.prog_type == BPF_PROG_TYPE_STRUCT_OPS || 237 - attr.prog_type == BPF_PROG_TYPE_LSM) { 238 - attr.attach_btf_id = load_attr->attach_btf_id; 239 - } else if (attr.prog_type == BPF_PROG_TYPE_TRACING || 240 - attr.prog_type == BPF_PROG_TYPE_EXT) { 241 - attr.attach_btf_id = load_attr->attach_btf_id; 232 + 233 + if (load_attr->attach_prog_fd) 242 234 attr.attach_prog_fd = load_attr->attach_prog_fd; 243 - } else { 244 - attr.prog_ifindex = load_attr->prog_ifindex; 245 - attr.kern_version = load_attr->kern_version; 246 - } 247 - attr.insn_cnt = (__u32)load_attr->insns_cnt; 235 + else 236 + attr.attach_btf_obj_fd = load_attr->attach_btf_obj_fd; 237 + attr.attach_btf_id = load_attr->attach_btf_id; 238 + 239 + attr.prog_ifindex = load_attr->prog_ifindex; 240 
+ attr.kern_version = load_attr->kern_version; 241 + 242 + attr.insn_cnt = (__u32)load_attr->insn_cnt; 248 243 attr.insns = ptr_to_u64(load_attr->insns); 249 244 attr.license = ptr_to_u64(load_attr->license); 250 245 251 - attr.log_level = log_level; 252 - if (log_level) { 253 - attr.log_buf = ptr_to_u64(log_buf); 254 - attr.log_size = log_buf_sz; 255 - } else { 256 - attr.log_buf = ptr_to_u64(NULL); 257 - attr.log_size = 0; 246 + attr.log_level = load_attr->log_level; 247 + if (attr.log_level) { 248 + attr.log_buf = ptr_to_u64(load_attr->log_buf); 249 + attr.log_size = load_attr->log_buf_sz; 258 250 } 259 251 260 252 attr.prog_btf_fd = load_attr->prog_btf_fd; 253 + attr.prog_flags = load_attr->prog_flags; 254 + 261 255 attr.func_info_rec_size = load_attr->func_info_rec_size; 262 256 attr.func_info_cnt = load_attr->func_info_cnt; 263 257 attr.func_info = ptr_to_u64(load_attr->func_info); 258 + 264 259 attr.line_info_rec_size = load_attr->line_info_rec_size; 265 260 attr.line_info_cnt = load_attr->line_info_cnt; 266 261 attr.line_info = ptr_to_u64(load_attr->line_info); 262 + 267 263 if (load_attr->name) 268 264 memcpy(attr.prog_name, load_attr->name, 269 - min(strlen(load_attr->name), BPF_OBJ_NAME_LEN - 1)); 270 - attr.prog_flags = load_attr->prog_flags; 265 + min(strlen(load_attr->name), (size_t)BPF_OBJ_NAME_LEN - 1)); 271 266 272 267 fd = sys_bpf_prog_load(&attr, sizeof(attr)); 273 268 if (fd >= 0) ··· 303 306 } 304 307 305 308 fd = sys_bpf_prog_load(&attr, sizeof(attr)); 306 - 307 309 if (fd >= 0) 308 310 goto done; 309 311 } 310 312 311 - if (log_level || !log_buf) 313 + if (load_attr->log_level || !load_attr->log_buf) 312 314 goto done; 313 315 314 316 /* Try again with log */ 315 - attr.log_buf = ptr_to_u64(log_buf); 316 - attr.log_size = log_buf_sz; 317 + attr.log_buf = ptr_to_u64(load_attr->log_buf); 318 + attr.log_size = load_attr->log_buf_sz; 317 319 attr.log_level = 1; 318 - log_buf[0] = 0; 320 + load_attr->log_buf[0] = 0; 321 + 319 322 fd = 
sys_bpf_prog_load(&attr, sizeof(attr)); 320 323 done: 321 324 free(finfo); 322 325 free(linfo); 323 326 return fd; 327 + } 328 + 329 + int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, 330 + char *log_buf, size_t log_buf_sz) 331 + { 332 + struct bpf_prog_load_params p = {}; 333 + 334 + if (!load_attr || !log_buf != !log_buf_sz) 335 + return -EINVAL; 336 + 337 + p.prog_type = load_attr->prog_type; 338 + p.expected_attach_type = load_attr->expected_attach_type; 339 + switch (p.prog_type) { 340 + case BPF_PROG_TYPE_STRUCT_OPS: 341 + case BPF_PROG_TYPE_LSM: 342 + p.attach_btf_id = load_attr->attach_btf_id; 343 + break; 344 + case BPF_PROG_TYPE_TRACING: 345 + case BPF_PROG_TYPE_EXT: 346 + p.attach_btf_id = load_attr->attach_btf_id; 347 + p.attach_prog_fd = load_attr->attach_prog_fd; 348 + break; 349 + default: 350 + p.prog_ifindex = load_attr->prog_ifindex; 351 + p.kern_version = load_attr->kern_version; 352 + } 353 + p.insn_cnt = load_attr->insns_cnt; 354 + p.insns = load_attr->insns; 355 + p.license = load_attr->license; 356 + p.log_level = load_attr->log_level; 357 + p.log_buf = log_buf; 358 + p.log_buf_sz = log_buf_sz; 359 + p.prog_btf_fd = load_attr->prog_btf_fd; 360 + p.func_info_rec_size = load_attr->func_info_rec_size; 361 + p.func_info_cnt = load_attr->func_info_cnt; 362 + p.func_info = load_attr->func_info; 363 + p.line_info_rec_size = load_attr->line_info_rec_size; 364 + p.line_info_cnt = load_attr->line_info_cnt; 365 + p.line_info = load_attr->line_info; 366 + p.name = load_attr->name; 367 + p.prog_flags = load_attr->prog_flags; 368 + 369 + return libbpf__bpf_prog_load(&p); 324 370 } 325 371 326 372 int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,

+44 -30

tools/lib/bpf/btf.c

··· 432 432 return btf->start_id + btf->nr_types - 1; 433 433 } 434 434 435 + const struct btf *btf__base_btf(const struct btf *btf) 436 + { 437 + return btf->base_btf; 438 + } 439 + 435 440 /* internal helper returning non-const pointer to a type */ 436 441 static struct btf_type *btf_type_by_id(struct btf *btf, __u32 type_id) 437 442 { ··· 679 674 680 675 __s32 btf__find_by_name(const struct btf *btf, const char *type_name) 681 676 { 682 - __u32 i; 677 + __u32 i, nr_types = btf__get_nr_types(btf); 683 678 684 679 if (!strcmp(type_name, "void")) 685 680 return 0; 686 681 687 - for (i = 1; i <= btf->nr_types; i++) { 682 + for (i = 1; i <= nr_types; i++) { 688 683 const struct btf_type *t = btf__type_by_id(btf, i); 689 684 const char *name = btf__name_by_offset(btf, t->name_off); 690 685 ··· 698 693 __s32 btf__find_by_name_kind(const struct btf *btf, const char *type_name, 699 694 __u32 kind) 700 695 { 701 - __u32 i; 696 + __u32 i, nr_types = btf__get_nr_types(btf); 702 697 703 698 if (kind == BTF_KIND_UNKN || !strcmp(type_name, "void")) 704 699 return 0; 705 700 706 - for (i = 1; i <= btf->nr_types; i++) { 701 + for (i = 1; i <= nr_types; i++) { 707 702 const struct btf_type *t = btf__type_by_id(btf, i); 708 703 const char *name; 709 704 ··· 1323 1318 return btf__str_by_offset(btf, offset); 1324 1319 } 1325 1320 1326 - int btf__get_from_id(__u32 id, struct btf **btf) 1321 + struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf) 1327 1322 { 1328 - struct bpf_btf_info btf_info = { 0 }; 1323 + struct bpf_btf_info btf_info; 1329 1324 __u32 len = sizeof(btf_info); 1330 1325 __u32 last_size; 1331 - int btf_fd; 1326 + struct btf *btf; 1332 1327 void *ptr; 1333 1328 int err; 1334 - 1335 - err = 0; 1336 - *btf = NULL; 1337 - btf_fd = bpf_btf_get_fd_by_id(id); 1338 - if (btf_fd < 0) 1339 - return 0; 1340 1329 1341 1330 /* we won't know btf_size until we call bpf_obj_get_info_by_fd(). 
so 1342 1331 * let's start with a sane default - 4KiB here - and resize it only if 1343 1332 * bpf_obj_get_info_by_fd() needs a bigger buffer. 1344 1333 */ 1345 - btf_info.btf_size = 4096; 1346 - last_size = btf_info.btf_size; 1334 + last_size = 4096; 1347 1335 ptr = malloc(last_size); 1348 - if (!ptr) { 1349 - err = -ENOMEM; 1350 - goto exit_free; 1351 - } 1336 + if (!ptr) 1337 + return ERR_PTR(-ENOMEM); 1352 1338 1353 - memset(ptr, 0, last_size); 1339 + memset(&btf_info, 0, sizeof(btf_info)); 1354 1340 btf_info.btf = ptr_to_u64(ptr); 1341 + btf_info.btf_size = last_size; 1355 1342 err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len); 1356 1343 1357 1344 if (!err && btf_info.btf_size > last_size) { ··· 1352 1355 last_size = btf_info.btf_size; 1353 1356 temp_ptr = realloc(ptr, last_size); 1354 1357 if (!temp_ptr) { 1355 - err = -ENOMEM; 1358 + btf = ERR_PTR(-ENOMEM); 1356 1359 goto exit_free; 1357 1360 } 1358 1361 ptr = temp_ptr; 1359 - memset(ptr, 0, last_size); 1362 + 1363 + len = sizeof(btf_info); 1364 + memset(&btf_info, 0, sizeof(btf_info)); 1360 1365 btf_info.btf = ptr_to_u64(ptr); 1366 + btf_info.btf_size = last_size; 1367 + 1361 1368 err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len); 1362 1369 } 1363 1370 1364 1371 if (err || btf_info.btf_size > last_size) { 1365 - err = errno; 1372 + btf = err ? 
ERR_PTR(-errno) : ERR_PTR(-E2BIG); 1366 1373 goto exit_free; 1367 1374 } 1368 1375 1369 - *btf = btf__new((__u8 *)(long)btf_info.btf, btf_info.btf_size); 1370 - if (IS_ERR(*btf)) { 1371 - err = PTR_ERR(*btf); 1372 - *btf = NULL; 1373 - } 1376 + btf = btf_new(ptr, btf_info.btf_size, base_btf); 1374 1377 1375 1378 exit_free: 1376 - close(btf_fd); 1377 1379 free(ptr); 1380 + return btf; 1381 + } 1378 1382 1379 - return err; 1383 + int btf__get_from_id(__u32 id, struct btf **btf) 1384 + { 1385 + struct btf *res; 1386 + int btf_fd; 1387 + 1388 + *btf = NULL; 1389 + btf_fd = bpf_btf_get_fd_by_id(id); 1390 + if (btf_fd < 0) 1391 + return -errno; 1392 + 1393 + res = btf_get_from_fd(btf_fd, NULL); 1394 + close(btf_fd); 1395 + if (IS_ERR(res)) 1396 + return PTR_ERR(res); 1397 + 1398 + *btf = res; 1399 + return 0; 1380 1400 } 1381 1401 1382 1402 int btf__get_map_kv_tids(const struct btf *btf, const char *map_name,

+1

tools/lib/bpf/btf.h

··· 51 51 LIBBPF_API __s32 btf__find_by_name_kind(const struct btf *btf, 52 52 const char *type_name, __u32 kind); 53 53 LIBBPF_API __u32 btf__get_nr_types(const struct btf *btf); 54 + LIBBPF_API const struct btf *btf__base_btf(const struct btf *btf); 54 55 LIBBPF_API const struct btf_type *btf__type_by_id(const struct btf *btf, 55 56 __u32 id); 56 57 LIBBPF_API size_t btf__pointer_size(const struct btf *btf);

+411 -118

tools/lib/bpf/libbpf.c

··· 176 176 FEAT_PROBE_READ_KERN, 177 177 /* BPF_PROG_BIND_MAP is supported */ 178 178 FEAT_PROG_BIND_MAP, 179 + /* Kernel support for module BTFs */ 180 + FEAT_MODULE_BTF, 179 181 __FEAT_CNT, 180 182 }; 181 183 ··· 278 276 enum bpf_prog_type type; 279 277 enum bpf_attach_type expected_attach_type; 280 278 int prog_ifindex; 279 + __u32 attach_btf_obj_fd; 281 280 __u32 attach_btf_id; 282 281 __u32 attach_prog_fd; 283 282 void *func_info; ··· 405 402 406 403 static LIST_HEAD(bpf_objects_list); 407 404 405 + struct module_btf { 406 + struct btf *btf; 407 + char *name; 408 + __u32 id; 409 + int fd; 410 + }; 411 + 408 412 struct bpf_object { 409 413 char name[BPF_OBJ_NAME_LEN]; 410 414 char license[64]; ··· 472 462 struct list_head list; 473 463 474 464 struct btf *btf; 465 + struct btf_ext *btf_ext; 466 + 475 467 /* Parse and load BTF vmlinux if any of the programs in the object need 476 468 * it at load time. 477 469 */ 478 470 struct btf *btf_vmlinux; 479 - struct btf_ext *btf_ext; 471 + /* vmlinux BTF override for CO-RE relocations */ 472 + struct btf *btf_vmlinux_override; 473 + /* Lazily initialized kernel module BTFs */ 474 + struct module_btf *btf_modules; 475 + bool btf_modules_loaded; 476 + size_t btf_module_cnt; 477 + size_t btf_module_cap; 480 478 481 479 void *priv; 482 480 bpf_object_clear_priv_t clear_priv; ··· 3978 3960 return ret >= 0; 3979 3961 } 3980 3962 3963 + static int probe_module_btf(void) 3964 + { 3965 + static const char strs[] = "\0int"; 3966 + __u32 types[] = { 3967 + /* int */ 3968 + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), 3969 + }; 3970 + struct bpf_btf_info info; 3971 + __u32 len = sizeof(info); 3972 + char name[16]; 3973 + int fd, err; 3974 + 3975 + fd = libbpf__load_raw_btf((char *)types, sizeof(types), strs, sizeof(strs)); 3976 + if (fd < 0) 3977 + return 0; /* BTF not supported at all */ 3978 + 3979 + memset(&info, 0, sizeof(info)); 3980 + info.name = ptr_to_u64(name); 3981 + info.name_len = sizeof(name); 3982 + 3983 + /* check 
that BPF_OBJ_GET_INFO_BY_FD supports specifying name pointer; 3984 + * kernel's module BTF support coincides with support for 3985 + * name/name_len fields in struct bpf_btf_info. 3986 + */ 3987 + err = bpf_obj_get_info_by_fd(fd, &info, &len); 3988 + close(fd); 3989 + return !err; 3990 + } 3991 + 3981 3992 enum kern_feature_result { 3982 3993 FEAT_UNKNOWN = 0, 3983 3994 FEAT_SUPPORTED = 1, ··· 4050 4003 }, 4051 4004 [FEAT_PROG_BIND_MAP] = { 4052 4005 "BPF_PROG_BIND_MAP support", probe_prog_bind_map, 4053 - } 4006 + }, 4007 + [FEAT_MODULE_BTF] = { 4008 + "module BTF support", probe_module_btf, 4009 + }, 4054 4010 }; 4055 4011 4056 4012 static bool kernel_supports(enum kern_feature_id feat_id) ··· 4653 4603 return n; 4654 4604 } 4655 4605 4656 - /* dynamically sized list of type IDs */ 4657 - struct ids_vec { 4658 - __u32 *data; 4606 + struct core_cand 4607 + { 4608 + const struct btf *btf; 4609 + const struct btf_type *t; 4610 + const char *name; 4611 + __u32 id; 4612 + }; 4613 + 4614 + /* dynamically sized list of type IDs and its associated struct btf */ 4615 + struct core_cand_list { 4616 + struct core_cand *cands; 4659 4617 int len; 4660 4618 }; 4661 4619 4662 - static void bpf_core_free_cands(struct ids_vec *cand_ids) 4620 + static void bpf_core_free_cands(struct core_cand_list *cands) 4663 4621 { 4664 - free(cand_ids->data); 4665 - free(cand_ids); 4622 + free(cands->cands); 4623 + free(cands); 4666 4624 } 4667 4625 4668 - static struct ids_vec *bpf_core_find_cands(const struct btf *local_btf, 4669 - __u32 local_type_id, 4670 - const struct btf *targ_btf) 4626 + static int bpf_core_add_cands(struct core_cand *local_cand, 4627 + size_t local_essent_len, 4628 + const struct btf *targ_btf, 4629 + const char *targ_btf_name, 4630 + int targ_start_id, 4631 + struct core_cand_list *cands) 4671 4632 { 4672 - size_t local_essent_len, targ_essent_len; 4673 - const char *local_name, *targ_name; 4674 - const struct btf_type *t, *local_t; 4675 - struct ids_vec *cand_ids; 
4676 - __u32 *new_ids; 4677 - int i, err, n; 4678 - 4679 - local_t = btf__type_by_id(local_btf, local_type_id); 4680 - if (!local_t) 4681 - return ERR_PTR(-EINVAL); 4682 - 4683 - local_name = btf__name_by_offset(local_btf, local_t->name_off); 4684 - if (str_is_empty(local_name)) 4685 - return ERR_PTR(-EINVAL); 4686 - local_essent_len = bpf_core_essential_name_len(local_name); 4687 - 4688 - cand_ids = calloc(1, sizeof(*cand_ids)); 4689 - if (!cand_ids) 4690 - return ERR_PTR(-ENOMEM); 4633 + struct core_cand *new_cands, *cand; 4634 + const struct btf_type *t; 4635 + const char *targ_name; 4636 + size_t targ_essent_len; 4637 + int n, i; 4691 4638 4692 4639 n = btf__get_nr_types(targ_btf); 4693 - for (i = 1; i <= n; i++) { 4640 + for (i = targ_start_id; i <= n; i++) { 4694 4641 t = btf__type_by_id(targ_btf, i); 4695 - if (btf_kind(t) != btf_kind(local_t)) 4642 + if (btf_kind(t) != btf_kind(local_cand->t)) 4696 4643 continue; 4697 4644 4698 4645 targ_name = btf__name_by_offset(targ_btf, t->name_off); ··· 4700 4653 if (targ_essent_len != local_essent_len) 4701 4654 continue; 4702 4655 4703 - if (strncmp(local_name, targ_name, local_essent_len) == 0) { 4704 - pr_debug("CO-RE relocating [%d] %s %s: found target candidate [%d] %s %s\n", 4705 - local_type_id, btf_kind_str(local_t), 4706 - local_name, i, btf_kind_str(t), targ_name); 4707 - new_ids = libbpf_reallocarray(cand_ids->data, 4708 - cand_ids->len + 1, 4709 - sizeof(*cand_ids->data)); 4710 - if (!new_ids) { 4711 - err = -ENOMEM; 4712 - goto err_out; 4713 - } 4714 - cand_ids->data = new_ids; 4715 - cand_ids->data[cand_ids->len++] = i; 4716 - } 4656 + if (strncmp(local_cand->name, targ_name, local_essent_len) != 0) 4657 + continue; 4658 + 4659 + pr_debug("CO-RE relocating [%d] %s %s: found target candidate [%d] %s %s in [%s]\n", 4660 + local_cand->id, btf_kind_str(local_cand->t), 4661 + local_cand->name, i, btf_kind_str(t), targ_name, 4662 + targ_btf_name); 4663 + new_cands = libbpf_reallocarray(cands->cands, cands->len 
+ 1, 4664 + sizeof(*cands->cands)); 4665 + if (!new_cands) 4666 + return -ENOMEM; 4667 + 4668 + cand = &new_cands[cands->len]; 4669 + cand->btf = targ_btf; 4670 + cand->t = t; 4671 + cand->name = targ_name; 4672 + cand->id = i; 4673 + 4674 + cands->cands = new_cands; 4675 + cands->len++; 4717 4676 } 4718 - return cand_ids; 4677 + return 0; 4678 + } 4679 + 4680 + static int load_module_btfs(struct bpf_object *obj) 4681 + { 4682 + struct bpf_btf_info info; 4683 + struct module_btf *mod_btf; 4684 + struct btf *btf; 4685 + char name[64]; 4686 + __u32 id = 0, len; 4687 + int err, fd; 4688 + 4689 + if (obj->btf_modules_loaded) 4690 + return 0; 4691 + 4692 + /* don't do this again, even if we find no module BTFs */ 4693 + obj->btf_modules_loaded = true; 4694 + 4695 + /* kernel too old to support module BTFs */ 4696 + if (!kernel_supports(FEAT_MODULE_BTF)) 4697 + return 0; 4698 + 4699 + while (true) { 4700 + err = bpf_btf_get_next_id(id, &id); 4701 + if (err && errno == ENOENT) 4702 + return 0; 4703 + if (err) { 4704 + err = -errno; 4705 + pr_warn("failed to iterate BTF objects: %d\n", err); 4706 + return err; 4707 + } 4708 + 4709 + fd = bpf_btf_get_fd_by_id(id); 4710 + if (fd < 0) { 4711 + if (errno == ENOENT) 4712 + continue; /* expected race: BTF was unloaded */ 4713 + err = -errno; 4714 + pr_warn("failed to get BTF object #%d FD: %d\n", id, err); 4715 + return err; 4716 + } 4717 + 4718 + len = sizeof(info); 4719 + memset(&info, 0, sizeof(info)); 4720 + info.name = ptr_to_u64(name); 4721 + info.name_len = sizeof(name); 4722 + 4723 + err = bpf_obj_get_info_by_fd(fd, &info, &len); 4724 + if (err) { 4725 + err = -errno; 4726 + pr_warn("failed to get BTF object #%d info: %d\n", id, err); 4727 + goto err_out; 4728 + } 4729 + 4730 + /* ignore non-module BTFs */ 4731 + if (!info.kernel_btf || strcmp(name, "vmlinux") == 0) { 4732 + close(fd); 4733 + continue; 4734 + } 4735 + 4736 + btf = btf_get_from_fd(fd, obj->btf_vmlinux); 4737 + if (IS_ERR(btf)) { 4738 + pr_warn("failed to 
load module [%s]'s BTF object #%d: %ld\n", 4739 + name, id, PTR_ERR(btf)); 4740 + err = PTR_ERR(btf); 4741 + goto err_out; 4742 + } 4743 + 4744 + err = btf_ensure_mem((void **)&obj->btf_modules, &obj->btf_module_cap, 4745 + sizeof(*obj->btf_modules), obj->btf_module_cnt + 1); 4746 + if (err) 4747 + goto err_out; 4748 + 4749 + mod_btf = &obj->btf_modules[obj->btf_module_cnt++]; 4750 + 4751 + mod_btf->btf = btf; 4752 + mod_btf->id = id; 4753 + mod_btf->fd = fd; 4754 + mod_btf->name = strdup(name); 4755 + if (!mod_btf->name) { 4756 + err = -ENOMEM; 4757 + goto err_out; 4758 + } 4759 + continue; 4760 + 4719 4761 err_out: 4720 - bpf_core_free_cands(cand_ids); 4762 + close(fd); 4763 + return err; 4764 + } 4765 + 4766 + return 0; 4767 + } 4768 + 4769 + static struct core_cand_list * 4770 + bpf_core_find_cands(struct bpf_object *obj, const struct btf *local_btf, __u32 local_type_id) 4771 + { 4772 + struct core_cand local_cand = {}; 4773 + struct core_cand_list *cands; 4774 + const struct btf *main_btf; 4775 + size_t local_essent_len; 4776 + int err, i; 4777 + 4778 + local_cand.btf = local_btf; 4779 + local_cand.t = btf__type_by_id(local_btf, local_type_id); 4780 + if (!local_cand.t) 4781 + return ERR_PTR(-EINVAL); 4782 + 4783 + local_cand.name = btf__name_by_offset(local_btf, local_cand.t->name_off); 4784 + if (str_is_empty(local_cand.name)) 4785 + return ERR_PTR(-EINVAL); 4786 + local_essent_len = bpf_core_essential_name_len(local_cand.name); 4787 + 4788 + cands = calloc(1, sizeof(*cands)); 4789 + if (!cands) 4790 + return ERR_PTR(-ENOMEM); 4791 + 4792 + /* Attempt to find target candidates in vmlinux BTF first */ 4793 + main_btf = obj->btf_vmlinux_override ?: obj->btf_vmlinux; 4794 + err = bpf_core_add_cands(&local_cand, local_essent_len, main_btf, "vmlinux", 1, cands); 4795 + if (err) 4796 + goto err_out; 4797 + 4798 + /* if vmlinux BTF has any candidate, don't go for module BTFs */ 4799 + if (cands->len) 4800 + return cands; 4801 + 4802 + /* if vmlinux BTF was 
overridden, don't attempt to load module BTFs */ 4803 + if (obj->btf_vmlinux_override) 4804 + return cands; 4805 + 4806 + /* now look through module BTFs, trying to still find candidates */ 4807 + err = load_module_btfs(obj); 4808 + if (err) 4809 + goto err_out; 4810 + 4811 + for (i = 0; i < obj->btf_module_cnt; i++) { 4812 + err = bpf_core_add_cands(&local_cand, local_essent_len, 4813 + obj->btf_modules[i].btf, 4814 + obj->btf_modules[i].name, 4815 + btf__get_nr_types(obj->btf_vmlinux) + 1, 4816 + cands); 4817 + if (err) 4818 + goto err_out; 4819 + } 4820 + 4821 + return cands; 4822 + err_out: 4823 + bpf_core_free_cands(cands); 4721 4824 return ERR_PTR(err); 4722 4825 } 4723 4826 ··· 5861 5664 const struct bpf_core_relo *relo, 5862 5665 int relo_idx, 5863 5666 const struct btf *local_btf, 5864 - const struct btf *targ_btf, 5865 5667 struct hashmap *cand_cache) 5866 5668 { 5867 5669 struct bpf_core_spec local_spec, cand_spec, targ_spec = {}; ··· 5868 5672 struct bpf_core_relo_res cand_res, targ_res; 5869 5673 const struct btf_type *local_type; 5870 5674 const char *local_name; 5871 - struct ids_vec *cand_ids; 5872 - __u32 local_id, cand_id; 5675 + struct core_cand_list *cands = NULL; 5676 + __u32 local_id; 5873 5677 const char *spec_str; 5874 5678 int i, j, err; 5875 5679 ··· 5916 5720 return -EOPNOTSUPP; 5917 5721 } 5918 5722 5919 - if (!hashmap__find(cand_cache, type_key, (void **)&cand_ids)) { 5920 - cand_ids = bpf_core_find_cands(local_btf, local_id, targ_btf); 5921 - if (IS_ERR(cand_ids)) { 5922 - pr_warn("prog '%s': relo #%d: target candidate search failed for [%d] %s %s: %ld", 5723 + if (!hashmap__find(cand_cache, type_key, (void **)&cands)) { 5724 + cands = bpf_core_find_cands(prog->obj, local_btf, local_id); 5725 + if (IS_ERR(cands)) { 5726 + pr_warn("prog '%s': relo #%d: target candidate search failed for [%d] %s %s: %ld\n", 5923 5727 prog->name, relo_idx, local_id, btf_kind_str(local_type), 5924 - local_name, PTR_ERR(cand_ids)); 5925 - return 
PTR_ERR(cand_ids); 5728 + local_name, PTR_ERR(cands)); 5729 + return PTR_ERR(cands); 5926 5730 } 5927 - err = hashmap__set(cand_cache, type_key, cand_ids, NULL, NULL); 5731 + err = hashmap__set(cand_cache, type_key, cands, NULL, NULL); 5928 5732 if (err) { 5929 - bpf_core_free_cands(cand_ids); 5733 + bpf_core_free_cands(cands); 5930 5734 return err; 5931 5735 } 5932 5736 } 5933 5737 5934 - for (i = 0, j = 0; i < cand_ids->len; i++) { 5935 - cand_id = cand_ids->data[i]; 5936 - err = bpf_core_spec_match(&local_spec, targ_btf, cand_id, &cand_spec); 5738 + for (i = 0, j = 0; i < cands->len; i++) { 5739 + err = bpf_core_spec_match(&local_spec, cands->cands[i].btf, 5740 + cands->cands[i].id, &cand_spec); 5937 5741 if (err < 0) { 5938 5742 pr_warn("prog '%s': relo #%d: error matching candidate #%d ", 5939 5743 prog->name, relo_idx, i); ··· 5977 5781 return -EINVAL; 5978 5782 } 5979 5783 5980 - cand_ids->data[j++] = cand_spec.root_type_id; 5784 + cands->cands[j++] = cands->cands[i]; 5981 5785 } 5982 5786 5983 5787 /* ··· 5989 5793 * depending on relo's kind. 
5990 5794 */ 5991 5795 if (j > 0) 5992 - cand_ids->len = j; 5796 + cands->len = j; 5993 5797 5994 5798 /* 5995 5799 * If no candidates were found, it might be both a programmer error, ··· 6033 5837 struct hashmap_entry *entry; 6034 5838 struct hashmap *cand_cache = NULL; 6035 5839 struct bpf_program *prog; 6036 - struct btf *targ_btf; 6037 5840 const char *sec_name; 6038 5841 int i, err = 0, insn_idx, sec_idx; 6039 5842 6040 5843 if (obj->btf_ext->core_relo_info.len == 0) 6041 5844 return 0; 6042 5845 6043 - if (targ_btf_path) 6044 - targ_btf = btf__parse(targ_btf_path, NULL); 6045 - else 6046 - targ_btf = obj->btf_vmlinux; 6047 - if (IS_ERR_OR_NULL(targ_btf)) { 6048 - pr_warn("failed to get target BTF: %ld\n", PTR_ERR(targ_btf)); 6049 - return PTR_ERR(targ_btf); 5846 + if (targ_btf_path) { 5847 + obj->btf_vmlinux_override = btf__parse(targ_btf_path, NULL); 5848 + if (IS_ERR_OR_NULL(obj->btf_vmlinux_override)) { 5849 + err = PTR_ERR(obj->btf_vmlinux_override); 5850 + pr_warn("failed to parse target BTF: %d\n", err); 5851 + return err; 5852 + } 6050 5853 } 6051 5854 6052 5855 cand_cache = hashmap__new(bpf_core_hash_fn, bpf_core_equal_fn, NULL); ··· 6097 5902 if (!prog->load) 6098 5903 continue; 6099 5904 6100 - err = bpf_core_apply_relo(prog, rec, i, obj->btf, 6101 - targ_btf, cand_cache); 5905 + err = bpf_core_apply_relo(prog, rec, i, obj->btf, cand_cache); 6102 5906 if (err) { 6103 5907 pr_warn("prog '%s': relo #%d: failed to relocate: %d\n", 6104 5908 prog->name, i, err); ··· 6107 5913 } 6108 5914 6109 5915 out: 6110 - /* obj->btf_vmlinux is freed at the end of object load phase */ 6111 - if (targ_btf != obj->btf_vmlinux) 6112 - btf__free(targ_btf); 5916 + /* obj->btf_vmlinux and module BTFs are freed after object load */ 5917 + btf__free(obj->btf_vmlinux_override); 5918 + obj->btf_vmlinux_override = NULL; 5919 + 6113 5920 if (!IS_ERR_OR_NULL(cand_cache)) { 6114 5921 hashmap__for_each_entry(cand_cache, entry, i) { 6115 5922 bpf_core_free_cands(entry->value); ··· 
6821 6626 load_program(struct bpf_program *prog, struct bpf_insn *insns, int insns_cnt, 6822 6627 char *license, __u32 kern_version, int *pfd) 6823 6628 { 6824 - struct bpf_load_program_attr load_attr; 6629 + struct bpf_prog_load_params load_attr = {}; 6825 6630 char *cp, errmsg[STRERR_BUFSIZE]; 6826 6631 size_t log_buf_size = 0; 6827 6632 char *log_buf = NULL; 6828 6633 int btf_fd, ret; 6829 6634 6635 + if (prog->type == BPF_PROG_TYPE_UNSPEC) { 6636 + /* 6637 + * The program type must be set. Most likely we couldn't find a proper 6638 + * section definition at load time, and thus we didn't infer the type. 6639 + */ 6640 + pr_warn("prog '%s': missing BPF prog type, check ELF section name '%s'\n", 6641 + prog->name, prog->sec_name); 6642 + return -EINVAL; 6643 + } 6644 + 6830 6645 if (!insns || !insns_cnt) 6831 6646 return -EINVAL; 6832 6647 6833 - memset(&load_attr, 0, sizeof(struct bpf_load_program_attr)); 6834 6648 load_attr.prog_type = prog->type; 6835 6649 /* old kernels might not support specifying expected_attach_type */ 6836 6650 if (!kernel_supports(FEAT_EXP_ATTACH_TYPE) && prog->sec_def && ··· 6850 6646 if (kernel_supports(FEAT_PROG_NAME)) 6851 6647 load_attr.name = prog->name; 6852 6648 load_attr.insns = insns; 6853 - load_attr.insns_cnt = insns_cnt; 6649 + load_attr.insn_cnt = insns_cnt; 6854 6650 load_attr.license = license; 6855 - if (prog->type == BPF_PROG_TYPE_STRUCT_OPS || 6856 - prog->type == BPF_PROG_TYPE_LSM) { 6857 - load_attr.attach_btf_id = prog->attach_btf_id; 6858 - } else if (prog->type == BPF_PROG_TYPE_TRACING || 6859 - prog->type == BPF_PROG_TYPE_EXT) { 6651 + load_attr.attach_btf_id = prog->attach_btf_id; 6652 + if (prog->attach_prog_fd) 6860 6653 load_attr.attach_prog_fd = prog->attach_prog_fd; 6861 - load_attr.attach_btf_id = prog->attach_btf_id; 6862 - } else { 6863 - load_attr.kern_version = kern_version; 6864 - load_attr.prog_ifindex = prog->prog_ifindex; 6865 - } 6654 + else 6655 + load_attr.attach_btf_obj_fd = 
prog->attach_btf_obj_fd; 6656 + load_attr.attach_btf_id = prog->attach_btf_id; 6657 + load_attr.kern_version = kern_version; 6658 + load_attr.prog_ifindex = prog->prog_ifindex; 6659 + 6866 6660 /* specify func_info/line_info only if kernel supports them */ 6867 6661 btf_fd = bpf_object__btf_fd(prog->obj); 6868 6662 if (btf_fd >= 0 && kernel_supports(FEAT_BTF_FUNC)) { ··· 6884 6682 *log_buf = 0; 6885 6683 } 6886 6684 6887 - ret = bpf_load_program_xattr(&load_attr, log_buf, log_buf_size); 6685 + load_attr.log_buf = log_buf; 6686 + load_attr.log_buf_sz = log_buf_size; 6687 + ret = libbpf__bpf_prog_load(&load_attr); 6888 6688 6889 6689 if (ret >= 0) { 6890 6690 if (log_buf && load_attr.log_level) ··· 6927 6723 pr_warn("-- BEGIN DUMP LOG ---\n"); 6928 6724 pr_warn("\n%s\n", log_buf); 6929 6725 pr_warn("-- END LOG --\n"); 6930 - } else if (load_attr.insns_cnt >= BPF_MAXINSNS) { 6726 + } else if (load_attr.insn_cnt >= BPF_MAXINSNS) { 6931 6727 pr_warn("Program too large (%zu insns), at most %d insns\n", 6932 - load_attr.insns_cnt, BPF_MAXINSNS); 6728 + load_attr.insn_cnt, BPF_MAXINSNS); 6933 6729 ret = -LIBBPF_ERRNO__PROG2BIG; 6934 6730 } else if (load_attr.prog_type != BPF_PROG_TYPE_KPROBE) { 6935 6731 /* Wrong program type? 
*/ ··· 6937 6733 6938 6734 load_attr.prog_type = BPF_PROG_TYPE_KPROBE; 6939 6735 load_attr.expected_attach_type = 0; 6940 - fd = bpf_load_program_xattr(&load_attr, NULL, 0); 6736 + load_attr.log_buf = NULL; 6737 + load_attr.log_buf_sz = 0; 6738 + fd = libbpf__bpf_prog_load(&load_attr); 6941 6739 if (fd >= 0) { 6942 6740 close(fd); 6943 6741 ret = -LIBBPF_ERRNO__PROGTYPE; ··· 6952 6746 return ret; 6953 6747 } 6954 6748 6955 - static int libbpf_find_attach_btf_id(struct bpf_program *prog); 6749 + static int libbpf_find_attach_btf_id(struct bpf_program *prog, int *btf_obj_fd, int *btf_type_id); 6956 6750 6957 6751 int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver) 6958 6752 { 6959 - int err = 0, fd, i, btf_id; 6753 + int err = 0, fd, i; 6960 6754 6961 6755 if (prog->obj->loaded) { 6962 6756 pr_warn("prog '%s': can't load after object was loaded\n", prog->name); ··· 6966 6760 if ((prog->type == BPF_PROG_TYPE_TRACING || 6967 6761 prog->type == BPF_PROG_TYPE_LSM || 6968 6762 prog->type == BPF_PROG_TYPE_EXT) && !prog->attach_btf_id) { 6969 - btf_id = libbpf_find_attach_btf_id(prog); 6970 - if (btf_id <= 0) 6971 - return btf_id; 6972 - prog->attach_btf_id = btf_id; 6763 + int btf_obj_fd = 0, btf_type_id = 0; 6764 + 6765 + err = libbpf_find_attach_btf_id(prog, &btf_obj_fd, &btf_type_id); 6766 + if (err) 6767 + return err; 6768 + 6769 + prog->attach_btf_obj_fd = btf_obj_fd; 6770 + prog->attach_btf_id = btf_type_id; 6973 6771 } 6974 6772 6975 6773 if (prog->instances.nr < 0 || !prog->instances.fds) { ··· 7133 6923 7134 6924 bpf_object__for_each_program(prog, obj) { 7135 6925 prog->sec_def = find_sec_def(prog->sec_name); 7136 - if (!prog->sec_def) 6926 + if (!prog->sec_def) { 7137 6927 /* couldn't guess, but user might manually specify */ 6928 + pr_debug("prog '%s': unrecognized ELF section name '%s'\n", 6929 + prog->name, prog->sec_name); 7138 6930 continue; 6931 + } 7139 6932 7140 6933 if (prog->sec_def->is_sleepable) 7141 6934 prog->prog_flags |= 
BPF_F_SLEEPABLE; ··· 7484 7271 err = err ? : bpf_object__relocate(obj, attr->target_btf_path); 7485 7272 err = err ? : bpf_object__load_progs(obj, attr->log_level); 7486 7273 7274 + /* clean up module BTFs */ 7275 + for (i = 0; i < obj->btf_module_cnt; i++) { 7276 + close(obj->btf_modules[i].fd); 7277 + btf__free(obj->btf_modules[i].btf); 7278 + free(obj->btf_modules[i].name); 7279 + } 7280 + free(obj->btf_modules); 7281 + 7282 + /* clean up vmlinux BTF */ 7487 7283 btf__free(obj->btf_vmlinux); 7488 7284 obj->btf_vmlinux = NULL; 7489 7285 ··· 7871 7649 return map->pinned; 7872 7650 } 7873 7651 7652 + static void sanitize_pin_path(char *s) 7653 + { 7654 + /* bpffs disallows periods in path names */ 7655 + while (*s) { 7656 + if (*s == '.') 7657 + *s = '_'; 7658 + s++; 7659 + } 7660 + } 7661 + 7874 7662 int bpf_object__pin_maps(struct bpf_object *obj, const char *path) 7875 7663 { 7876 7664 struct bpf_map *map; ··· 7910 7678 err = -ENAMETOOLONG; 7911 7679 goto err_unpin_maps; 7912 7680 } 7681 + sanitize_pin_path(buf); 7913 7682 pin_path = buf; 7914 7683 } else if (!map->pin_path) { 7915 7684 continue; ··· 7955 7722 return -EINVAL; 7956 7723 else if (len >= PATH_MAX) 7957 7724 return -ENAMETOOLONG; 7725 + sanitize_pin_path(buf); 7958 7726 pin_path = buf; 7959 7727 } else if (!map->pin_path) { 7960 7728 continue; ··· 8841 8607 return btf__find_by_name_kind(btf, btf_type_name, kind); 8842 8608 } 8843 8609 8844 - static inline int __find_vmlinux_btf_id(struct btf *btf, const char *name, 8845 - enum bpf_attach_type attach_type) 8610 + static inline int find_attach_btf_id(struct btf *btf, const char *name, 8611 + enum bpf_attach_type attach_type) 8846 8612 { 8847 8613 int err; 8848 8614 ··· 8857 8623 BTF_KIND_FUNC); 8858 8624 else 8859 8625 err = btf__find_by_name_kind(btf, name, BTF_KIND_FUNC); 8860 - 8861 - if (err <= 0) 8862 - pr_warn("%s is not found in vmlinux BTF\n", name); 8863 8626 8864 8627 return err; 8865 8628 } ··· 8873 8642 return -EINVAL; 8874 8643 } 8875 
8644 8876 - err = __find_vmlinux_btf_id(btf, name, attach_type); 8645 + err = find_attach_btf_id(btf, name, attach_type); 8646 + if (err <= 0) 8647 + pr_warn("%s is not found in vmlinux BTF\n", name); 8648 + 8877 8649 btf__free(btf); 8878 8650 return err; 8879 8651 } ··· 8914 8680 return err; 8915 8681 } 8916 8682 8917 - static int libbpf_find_attach_btf_id(struct bpf_program *prog) 8683 + static int find_kernel_btf_id(struct bpf_object *obj, const char *attach_name, 8684 + enum bpf_attach_type attach_type, 8685 + int *btf_obj_fd, int *btf_type_id) 8686 + { 8687 + int ret, i; 8688 + 8689 + ret = find_attach_btf_id(obj->btf_vmlinux, attach_name, attach_type); 8690 + if (ret > 0) { 8691 + *btf_obj_fd = 0; /* vmlinux BTF */ 8692 + *btf_type_id = ret; 8693 + return 0; 8694 + } 8695 + if (ret != -ENOENT) 8696 + return ret; 8697 + 8698 + ret = load_module_btfs(obj); 8699 + if (ret) 8700 + return ret; 8701 + 8702 + for (i = 0; i < obj->btf_module_cnt; i++) { 8703 + const struct module_btf *mod = &obj->btf_modules[i]; 8704 + 8705 + ret = find_attach_btf_id(mod->btf, attach_name, attach_type); 8706 + if (ret > 0) { 8707 + *btf_obj_fd = mod->fd; 8708 + *btf_type_id = ret; 8709 + return 0; 8710 + } 8711 + if (ret == -ENOENT) 8712 + continue; 8713 + 8714 + return ret; 8715 + } 8716 + 8717 + return -ESRCH; 8718 + } 8719 + 8720 + static int libbpf_find_attach_btf_id(struct bpf_program *prog, int *btf_obj_fd, int *btf_type_id) 8918 8721 { 8919 8722 enum bpf_attach_type attach_type = prog->expected_attach_type; 8920 8723 __u32 attach_prog_fd = prog->attach_prog_fd; 8921 - const char *name = prog->sec_name; 8724 + const char *name = prog->sec_name, *attach_name; 8725 + const struct bpf_sec_def *sec = NULL; 8922 8726 int i, err; 8923 8727 8924 8728 if (!name) ··· 8967 8695 continue; 8968 8696 if (strncmp(name, section_defs[i].sec, section_defs[i].len)) 8969 8697 continue; 8970 - if (attach_prog_fd) 8971 - err = libbpf_find_prog_btf_id(name + section_defs[i].len, 8972 - 
attach_prog_fd); 8973 - else 8974 - err = __find_vmlinux_btf_id(prog->obj->btf_vmlinux, 8975 - name + section_defs[i].len, 8976 - attach_type); 8698 + 8699 + sec = &section_defs[i]; 8700 + break; 8701 + } 8702 + 8703 + if (!sec) { 8704 + pr_warn("failed to identify BTF ID based on ELF section name '%s'\n", name); 8705 + return -ESRCH; 8706 + } 8707 + attach_name = name + sec->len; 8708 + 8709 + /* BPF program's BTF ID */ 8710 + if (attach_prog_fd) { 8711 + err = libbpf_find_prog_btf_id(attach_name, attach_prog_fd); 8712 + if (err < 0) { 8713 + pr_warn("failed to find BPF program (FD %d) BTF ID for '%s': %d\n", 8714 + attach_prog_fd, attach_name, err); 8715 + return err; 8716 + } 8717 + *btf_obj_fd = 0; 8718 + *btf_type_id = err; 8719 + return 0; 8720 + } 8721 + 8722 + /* kernel/module BTF ID */ 8723 + err = find_kernel_btf_id(prog->obj, attach_name, attach_type, btf_obj_fd, btf_type_id); 8724 + if (err) { 8725 + pr_warn("failed to find kernel BTF type ID of '%s': %d\n", attach_name, err); 8977 8726 return err; 8978 8727 } 8979 - pr_warn("failed to identify btf_id based on ELF section name '%s'\n", name); 8980 - return -ESRCH; 8728 + return 0; 8981 8729 } 8982 8730 8983 8731 int libbpf_attach_type_by_name(const char *name, ··· 10886 10594 return btf_id; 10887 10595 10888 10596 prog->attach_btf_id = btf_id; 10597 + prog->attach_btf_obj_fd = 0; 10889 10598 prog->attach_prog_fd = attach_prog_fd; 10890 10599 return 0; 10891 10600 }

+3

tools/lib/bpf/libbpf.map

··· 340 340 341 341 LIBBPF_0.3.0 { 342 342 global: 343 + btf__base_btf; 343 344 btf__parse_elf_split; 344 345 btf__parse_raw_split; 345 346 btf__parse_split; 346 347 btf__new_empty_split; 347 348 btf__new_split; 349 + xsk_setup_xdp_prog; 350 + xsk_socket__update_xskmap; 348 351 } LIBBPF_0.2.0;

+31

tools/lib/bpf/libbpf_internal.h

··· 151 151 int libbpf__load_raw_btf(const char *raw_types, size_t types_len, 152 152 const char *str_sec, size_t str_len); 153 153 154 + struct bpf_prog_load_params { 155 + enum bpf_prog_type prog_type; 156 + enum bpf_attach_type expected_attach_type; 157 + const char *name; 158 + const struct bpf_insn *insns; 159 + size_t insn_cnt; 160 + const char *license; 161 + __u32 kern_version; 162 + __u32 attach_prog_fd; 163 + __u32 attach_btf_obj_fd; 164 + __u32 attach_btf_id; 165 + __u32 prog_ifindex; 166 + __u32 prog_btf_fd; 167 + __u32 prog_flags; 168 + 169 + __u32 func_info_rec_size; 170 + const void *func_info; 171 + __u32 func_info_cnt; 172 + 173 + __u32 line_info_rec_size; 174 + const void *line_info; 175 + __u32 line_info_cnt; 176 + 177 + __u32 log_level; 178 + char *log_buf; 179 + size_t log_buf_sz; 180 + }; 181 + 182 + int libbpf__bpf_prog_load(const struct bpf_prog_load_params *load_attr); 183 + 154 184 int bpf_object__section_size(const struct bpf_object *obj, const char *name, 155 185 __u32 *size); 156 186 int bpf_object__variable_offset(const struct bpf_object *obj, const char *name, 157 187 __u32 *off); 188 + struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf); 158 189 159 190 struct btf_ext_info { 160 191 /*

+83 -9

tools/lib/bpf/xsk.c

··· 566 566 &xsk->fd, 0); 567 567 } 568 568 569 - static int xsk_setup_xdp_prog(struct xsk_socket *xsk) 569 + static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk) 570 570 { 571 + char ifname[IFNAMSIZ]; 572 + struct xsk_ctx *ctx; 573 + char *interface; 574 + 575 + ctx = calloc(1, sizeof(*ctx)); 576 + if (!ctx) 577 + return -ENOMEM; 578 + 579 + interface = if_indextoname(ifindex, &ifname[0]); 580 + if (!interface) { 581 + free(ctx); 582 + return -errno; 583 + } 584 + 585 + ctx->ifindex = ifindex; 586 + memcpy(ctx->ifname, ifname, IFNAMSIZ -1); 587 + ctx->ifname[IFNAMSIZ - 1] = 0; 588 + 589 + xsk->ctx = ctx; 590 + 591 + return 0; 592 + } 593 + 594 + static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, 595 + int *xsks_map_fd) 596 + { 597 + struct xsk_socket *xsk = _xdp; 571 598 struct xsk_ctx *ctx = xsk->ctx; 572 599 __u32 prog_id = 0; 573 600 int err; ··· 611 584 612 585 err = xsk_load_xdp_prog(xsk); 613 586 if (err) { 614 - xsk_delete_bpf_maps(xsk); 615 - return err; 587 + goto err_load_xdp_prog; 616 588 } 617 589 } else { 618 590 ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id); ··· 624 598 } 625 599 } 626 600 627 - if (xsk->rx) 601 + if (xsk->rx) { 628 602 err = xsk_set_bpf_maps(xsk); 629 - if (err) { 630 - xsk_delete_bpf_maps(xsk); 631 - close(ctx->prog_fd); 632 - return err; 603 + if (err) { 604 + if (!prog_id) { 605 + goto err_set_bpf_maps; 606 + } else { 607 + close(ctx->prog_fd); 608 + return err; 609 + } 610 + } 633 611 } 612 + if (xsks_map_fd) 613 + *xsks_map_fd = ctx->xsks_map_fd; 634 614 635 615 return 0; 616 + 617 + err_set_bpf_maps: 618 + close(ctx->prog_fd); 619 + bpf_set_link_xdp_fd(ctx->ifindex, -1, 0); 620 + err_load_xdp_prog: 621 + xsk_delete_bpf_maps(xsk); 622 + 623 + return err; 636 624 } 637 625 638 626 static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, int ifindex, ··· 727 687 ctx->comp = comp; 728 688 list_add(&ctx->list, &umem->ctx_list); 729 689 return ctx; 690 + } 691 + 692 + static void xsk_destroy_xsk_struct(struct 
xsk_socket *xsk) 693 + { 694 + free(xsk->ctx); 695 + free(xsk); 696 + } 697 + 698 + int xsk_socket__update_xskmap(struct xsk_socket *xsk, int fd) 699 + { 700 + xsk->ctx->xsks_map_fd = fd; 701 + return xsk_set_bpf_maps(xsk); 702 + } 703 + 704 + int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd) 705 + { 706 + struct xsk_socket *xsk; 707 + int res; 708 + 709 + xsk = calloc(1, sizeof(*xsk)); 710 + if (!xsk) 711 + return -ENOMEM; 712 + 713 + res = xsk_create_xsk_struct(ifindex, xsk); 714 + if (res) { 715 + free(xsk); 716 + return -EINVAL; 717 + } 718 + 719 + res = __xsk_setup_xdp_prog(xsk, xsks_map_fd); 720 + 721 + xsk_destroy_xsk_struct(xsk); 722 + 723 + return res; 730 724 } 731 725 732 726 int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, ··· 912 838 ctx->prog_fd = -1; 913 839 914 840 if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) { 915 - err = xsk_setup_xdp_prog(xsk); 841 + err = __xsk_setup_xdp_prog(xsk, NULL); 916 842 if (err) 917 843 goto out_mmap_tx; 918 844 }

+15 -7

tools/lib/bpf/xsk.h

··· 113 113 return (entries > nb) ? nb : entries; 114 114 } 115 115 116 - static inline size_t xsk_ring_prod__reserve(struct xsk_ring_prod *prod, 117 - size_t nb, __u32 *idx) 116 + static inline __u32 xsk_ring_prod__reserve(struct xsk_ring_prod *prod, __u32 nb, __u32 *idx) 118 117 { 119 118 if (xsk_prod_nb_free(prod, nb) < nb) 120 119 return 0; ··· 124 125 return nb; 125 126 } 126 127 127 - static inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, size_t nb) 128 + static inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, __u32 nb) 128 129 { 129 130 /* Make sure everything has been written to the ring before indicating 130 131 * this to the kernel by writing the producer pointer. ··· 134 135 *prod->producer += nb; 135 136 } 136 137 137 - static inline size_t xsk_ring_cons__peek(struct xsk_ring_cons *cons, 138 - size_t nb, __u32 *idx) 138 + static inline __u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, __u32 *idx) 139 139 { 140 - size_t entries = xsk_cons_nb_avail(cons, nb); 140 + __u32 entries = xsk_cons_nb_avail(cons, nb); 141 141 142 142 if (entries > 0) { 143 143 /* Make sure we do not speculatively read the data before ··· 151 153 return entries; 152 154 } 153 155 154 - static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, size_t nb) 156 + static inline void xsk_ring_cons__cancel(struct xsk_ring_cons *cons, __u32 nb) 157 + { 158 + cons->cached_cons -= nb; 159 + } 160 + 161 + static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, __u32 nb) 155 162 { 156 163 /* Make sure data has been read before indicating we are done 157 164 * with the entries by updating the consumer pointer. ··· 203 200 __u32 frame_headroom; 204 201 __u32 flags; 205 202 }; 203 + 204 + LIBBPF_API int xsk_setup_xdp_prog(int ifindex, 205 + int *xsks_map_fd); 206 + LIBBPF_API int xsk_socket__update_xskmap(struct xsk_socket *xsk, 207 + int xsks_map_fd); 206 208 207 209 /* Flags for the libbpf_flags field. 
*/ 208 210 #define XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD (1 << 0)

+1

tools/testing/selftests/bpf/.gitignore

··· 35 35 /tools 36 36 /runqslower 37 37 /bench 38 + *.ko

+13 -5

tools/testing/selftests/bpf/Makefile

··· 80 80 # Compile but not part of 'make run_tests' 81 81 TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ 82 82 flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ 83 - test_lirc_mode2_user xdping test_cpp runqslower bench 83 + test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko 84 84 85 85 TEST_CUSTOM_PROGS = urandom_read 86 86 ··· 104 104 override define CLEAN 105 105 $(call msg,CLEAN) 106 106 $(Q)$(RM) -r $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(EXTRA_CLEAN) 107 + $(Q)$(MAKE) -C bpf_testmod clean 107 108 endef 108 109 109 110 include ../lib.mk ··· 136 135 $(OUTPUT)/urandom_read: urandom_read.c 137 136 $(call msg,BINARY,,$@) 138 137 $(Q)$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS) -Wl,--build-id=sha1 138 + 139 + $(OUTPUT)/bpf_testmod.ko: $(VMLINUX_BTF) $(wildcard bpf_testmod/Makefile bpf_testmod/*.[ch]) 140 + $(call msg,MOD,,$@) 141 + $(Q)$(MAKE) $(submake_extras) -C bpf_testmod 142 + $(Q)cp bpf_testmod/bpf_testmod.ko $@ 139 143 140 144 $(OUTPUT)/test_stub.o: test_stub.c $(BPFOBJ) 141 145 $(call msg,CC,,$@) ··· 225 219 # build would have failed anyways. 226 220 define get_sys_includes 227 221 $(shell $(1) -v -E - </dev/null 2>&1 \ 228 - | sed -n '/<...> search starts here:/,/End of search list./{ s| $/.*$|-idirafter \1|p }') 222 + | sed -n '/<...> search starts here:/,/End of search list./{ s| $/.*$|-idirafter \1|p }') \ 223 + $(shell $(1) -dM -E - </dev/null | grep '#define __riscv_xlen ' | sed 's/#define /-D/' | sed 's/ /=/') 229 224 endef 230 225 231 226 # Determine target endianness. 
··· 384 377 | $(TRUNNER_BINARY)-extras 385 378 $$(call msg,BINARY,,$$@) 386 379 $(Q)$$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) -o $$@ 387 - $(Q)$(RESOLVE_BTFIDS) --no-fail --btf btf_data.o $$@ 380 + $(Q)$(RESOLVE_BTFIDS) --no-fail --btf $(TRUNNER_OUTPUT)/btf_data.o $$@ 388 381 389 382 endef 390 383 ··· 394 387 TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ 395 388 network_helpers.c testing_helpers.c \ 396 389 btf_helpers.c flow_dissector_load.h 397 - TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ 390 + TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \ 391 + ima_setup.sh \ 398 392 $(wildcard progs/btf_dump_test_case_*.c) 399 393 TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE 400 394 TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) ··· 466 458 EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) \ 467 459 prog_tests/tests.h map_tests/tests.h verifier/tests.h \ 468 460 feature \ 469 - $(addprefix $(OUTPUT)/,*.o *.skel.h no_alu32 bpf_gcc) 461 + $(addprefix $(OUTPUT)/,*.o *.skel.h no_alu32 bpf_gcc bpf_testmod.ko)

+19 -14

tools/testing/selftests/bpf/README.rst

··· 2 2 BPF Selftest Notes 3 3 ================== 4 4 General instructions on running selftests can be found in 5 - `Documentation/bpf/bpf_devel_QA.rst`_. 5 + `Documentation/bpf/bpf_devel_QA.rst`__. 6 + 7 + __ /Documentation/bpf/bpf_devel_QA.rst#q-how-to-run-bpf-selftests 8 + 6 9 7 10 Additional information about selftest failures are 8 11 documented here. ··· 33 30 At insn 18 the r7 is indeed unbounded. The later insn 19 checks the bounds and 34 31 the insn 20 undoes map_value addition. It is currently impossible for the 35 32 verifier to understand such speculative pointer arithmetic. 36 - Hence 37 - https://reviews.llvm.org/D85570 38 - addresses it on the compiler side. It was committed on llvm 12. 33 + Hence `this patch`__ addresses it on the compiler side. It was committed on llvm 12. 34 + 35 + __ https://reviews.llvm.org/D85570 39 36 40 37 The corresponding C code 38 + 41 39 .. code-block:: c 42 40 43 41 for (int i = 0; i < MAX_CGROUPS_PATH_DEPTH; i++) { ··· 81 77 17: (7b) *(u64 *)(r7 +0) = r2 82 78 only read is supported 83 79 84 - This is due to a llvm BPF backend bug. The fix 85 - https://reviews.llvm.org/D78466 80 + This is due to a llvm BPF backend bug. `The fix`__ 86 81 has been pushed to llvm 10.x release branch and will be 87 - available in 10.0.1. The fix is available in llvm 11.0.0 trunk. 82 + available in 10.0.1. The patch is available in llvm 11.0.0 trunk. 83 + 84 + __ https://reviews.llvm.org/D78466 88 85 89 86 BPF CO-RE-based tests and Clang version 90 87 ======================================= ··· 99 94 old to support them, they shouldn't cause build failures or runtime test 100 95 failures: 101 96 102 - - __builtin_btf_type_id() ([0], [1], [2]); 103 - - __builtin_preserve_type_info(), __builtin_preserve_enum_value() ([3], [4]). 97 + - __builtin_btf_type_id() [0_, 1_, 2_]; 98 + - __builtin_preserve_type_info(), __builtin_preserve_enum_value() [3_, 4_]. 
104 99 105 - [0] https://reviews.llvm.org/D74572 106 - [1] https://reviews.llvm.org/D74668 107 - [2] https://reviews.llvm.org/D85174 108 - [3] https://reviews.llvm.org/D83878 109 - [4] https://reviews.llvm.org/D83242 100 + .. _0: https://reviews.llvm.org/D74572 101 + .. _1: https://reviews.llvm.org/D74668 102 + .. _2: https://reviews.llvm.org/D85174 103 + .. _3: https://reviews.llvm.org/D83878 104 + .. _4: https://reviews.llvm.org/D83242

+1

tools/testing/selftests/bpf/bpf_tcp_helpers.h

··· 56 56 __u32 rcv_nxt; 57 57 __u32 snd_nxt; 58 58 __u32 snd_una; 59 + __u32 window_clamp; 59 60 __u8 ecn_flags; 60 61 __u32 delivered; 61 62 __u32 delivered_ce;

+6

tools/testing/selftests/bpf/bpf_testmod/.gitignore

··· 1 + *.mod 2 + *.mod.c 3 + *.o 4 + *.ko 5 + /Module.symvers 6 + /modules.order

+20

tools/testing/selftests/bpf/bpf_testmod/Makefile

# Out-of-tree build of the bpf_testmod kernel module.
#
# This file is read in two roles: directly (the `all` / `clean` targets
# below, which re-enter the kernel build system with M= pointing back
# here) and as the kbuild makefile for that M= directory (the `obj-m` /
# `CFLAGS_*` assignments).

# Directory containing this Makefile, resolved to a real absolute path.
BPF_TESTMOD_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST)))))
# Kernel tree to build against; defaults to five directories above this one.
KDIR ?= $(abspath $(BPF_TESTMOD_DIR)/../../../../..)

# V=1 echoes the recipe commands; otherwise they are silenced with '@'.
ifeq ($(V),1)
Q =
else
Q = @
endif

MODULES = bpf_testmod.ko

obj-m += bpf_testmod.o
# Add $(src) to the include path when compiling bpf_testmod.o.
CFLAGS_bpf_testmod.o = -I$(src)

.PHONY: all clean

# Use $(MAKE) rather than a literal `make` so the sub-make is the same
# binary as the parent and honours flags such as -n, -k and -j.
all:
	+$(Q)$(MAKE) -C $(KDIR) M=$(BPF_TESTMOD_DIR) modules

clean:
	+$(Q)$(MAKE) -C $(KDIR) M=$(BPF_TESTMOD_DIR) clean

+36

tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h

/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2020 Facebook */
/*
 * Tracepoint definitions for the bpf_testmod selftest module.
 * All events declared here belong to the "bpf_testmod" trace system.
 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM bpf_testmod

/*
 * Unlike a plain include guard, the body is processed again whenever
 * TRACE_HEADER_MULTI_READ is defined.
 * NOTE(review): presumably that is set by <trace/define_trace.h> when it
 * re-reads this header to generate the event code - confirm.
 */
#if !defined(_BPF_TESTMOD_EVENTS_H) || defined(TRACE_HEADER_MULTI_READ)
#define _BPF_TESTMOD_EVENTS_H

#include <linux/tracepoint.h>
#include "bpf_testmod.h"

/*
 * bpf_testmod_test_read - event taking the task and the read context.
 *
 * The recorded entry holds the task's pid and comm (TASK_COMM_LEN bytes)
 * plus the offset and length copied from @ctx; ctx->buf is not recorded.
 */
TRACE_EVENT(bpf_testmod_test_read,
	TP_PROTO(struct task_struct *task, struct bpf_testmod_test_read_ctx *ctx),
	TP_ARGS(task, ctx),
	TP_STRUCT__entry(
		__field(pid_t, pid)
		__array(char, comm, TASK_COMM_LEN)
		__field(loff_t, off)
		__field(size_t, len)
	),
	TP_fast_assign(
		__entry->pid = task->pid;
		memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
		__entry->off = ctx->off;
		__entry->len = ctx->len;
	),
	TP_printk("pid=%d comm=%s off=%llu len=%zu",
		__entry->pid, __entry->comm, __entry->off, __entry->len)
);

#endif /* _BPF_TESTMOD_EVENTS_H */

/*
 * Tell define_trace.h where to find this header again: in the current
 * include directory ("."), under the file name "bpf_testmod-events".
 */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#define TRACE_INCLUDE_FILE bpf_testmod-events
#include <trace/define_trace.h>

+52

tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + #include <linux/error-injection.h> 4 + #include <linux/init.h> 5 + #include <linux/module.h> 6 + #include <linux/sysfs.h> 7 + #include <linux/tracepoint.h> 8 + #include "bpf_testmod.h" 9 + 10 + #define CREATE_TRACE_POINTS 11 + #include "bpf_testmod-events.h" 12 + 13 + noinline ssize_t 14 + bpf_testmod_test_read(struct file *file, struct kobject *kobj, 15 + struct bin_attribute *bin_attr, 16 + char *buf, loff_t off, size_t len) 17 + { 18 + struct bpf_testmod_test_read_ctx ctx = { 19 + .buf = buf, 20 + .off = off, 21 + .len = len, 22 + }; 23 + 24 + trace_bpf_testmod_test_read(current, &ctx); 25 + 26 + return -EIO; /* always fail */ 27 + } 28 + EXPORT_SYMBOL(bpf_testmod_test_read); 29 + ALLOW_ERROR_INJECTION(bpf_testmod_test_read, ERRNO); 30 + 31 + static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = { 32 + .attr = { .name = "bpf_testmod", .mode = 0444, }, 33 + .read = bpf_testmod_test_read, 34 + }; 35 + 36 + static int bpf_testmod_init(void) 37 + { 38 + return sysfs_create_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); 39 + } 40 + 41 + static void bpf_testmod_exit(void) 42 + { 43 + return sysfs_remove_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); 44 + } 45 + 46 + module_init(bpf_testmod_init); 47 + module_exit(bpf_testmod_exit); 48 + 49 + MODULE_AUTHOR("Andrii Nakryiko"); 50 + MODULE_DESCRIPTION("BPF selftests module"); 51 + MODULE_LICENSE("Dual BSD/GPL"); 52 +

+14

tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h

/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2020 Facebook */
#ifndef _BPF_TESTMOD_H
#define _BPF_TESTMOD_H

#include <linux/types.h>

/*
 * Description of one read request, passed as the second argument of the
 * bpf_testmod_test_read tracepoint.
 */
struct bpf_testmod_test_read_ctx {
	char *buf;	/* destination buffer of the read */
	loff_t off;	/* offset of the read */
	size_t len;	/* number of bytes requested */
};

#endif /* _BPF_TESTMOD_H */

+5

tools/testing/selftests/bpf/config

··· 39 39 CONFIG_BPF_LSM=y 40 40 CONFIG_SECURITY=y 41 41 CONFIG_LIRC=y 42 + CONFIG_IMA=y 43 + CONFIG_SECURITYFS=y 44 + CONFIG_IMA_WRITE_POLICY=y 45 + CONFIG_IMA_READ_POLICY=y 46 + CONFIG_BLK_DEV_LOOP=y

+99

tools/testing/selftests/bpf/ima_setup.sh

#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# Helper for the BPF IMA selftest.
#
#   setup   <dir>  create a 10MB ext2 image in <dir>, loop-mount it on
#                  <dir>/mnt, copy the test binary into it and add an IMA
#                  "measure" policy rule keyed on the filesystem UUID
#   run     <dir>  exec the copied test binary from the mounted image
#   cleanup <dir>  detach the loop device(s), unmount and delete <dir>
#
# <dir> must already exist.

set -e
set -u
set -o pipefail

IMA_POLICY_FILE="/sys/kernel/security/ima/policy"
TEST_BINARY="/bin/true"

usage()
{
	echo "Usage: $0 <setup|cleanup|run> <existing_tmp_dir>"
	exit 1
}

# Print the loop devices backed by image file $1, one per line.
# Prints nothing (and still succeeds) when there are none.
find_loop_devices()
{
	local mount_img="$1"

	# -F: the image path is a literal string, not a regex.
	# "|| true": grep finding nothing must not abort the script under
	# "set -e" / "pipefail"; callers handle an empty result themselves.
	losetup -a | grep -F -- "${mount_img}" | cut -d ":" -f1 || true
}

# Make sure securityfs is mounted, mounting it on /sys/kernel/security
# when /proc/mounts does not list one.
ensure_mount_securityfs()
{
	local securityfs_dir

	# Field 3 of /proc/mounts is the filesystem type, field 2 the mount
	# point. awk exits 0 even when nothing matches, unlike grep.
	securityfs_dir=$(awk '$3 == "securityfs" { print $2; exit }' /proc/mounts)

	if [ -z "${securityfs_dir}" ]; then
		securityfs_dir=/sys/kernel/security
		mount -t securityfs security "${securityfs_dir}"
	fi

	if [ ! -d "${securityfs_dir}" ]; then
		echo "${securityfs_dir}: securityfs is not mounted" && exit 1
	fi
}

# $1: existing temporary directory to build the test filesystem in.
setup()
{
	local tmp_dir="$1"
	local mount_img="${tmp_dir}/test.img"
	local mount_dir="${tmp_dir}/mnt"
	local loop_device
	local mount_uuid

	mkdir -p "${mount_dir}"

	dd if=/dev/zero of="${mount_img}" bs=1M count=10

	losetup -f "${mount_img}"
	loop_device=$(find_loop_devices "${mount_img}")

	# ${var:?} aborts with an error if no loop device was found.
	mkfs.ext2 "${loop_device:?}"
	mount "${loop_device}" "${mount_dir}"

	cp "${TEST_BINARY}" "${mount_dir}"
	# Declared separately from the assignment so that a failing blkid
	# stops the script instead of yielding an empty UUID.
	mount_uuid=$(blkid "${loop_device}" | sed 's/.*UUID="\([^"]*\)".*/\1/')

	ensure_mount_securityfs
	echo "measure func=BPRM_CHECK fsuuid=${mount_uuid}" > "${IMA_POLICY_FILE}"
}

# $1: temporary directory previously passed to "setup".
cleanup() {
	local tmp_dir="$1"
	local mount_img="${tmp_dir}/test.img"
	local mount_dir="${tmp_dir}/mnt"
	local loop_devices
	local loop_dev

	loop_devices=$(find_loop_devices "${mount_img}")

	# Intentionally unquoted: word splitting yields one device per
	# iteration, and zero iterations when the list is empty.
	for loop_dev in ${loop_devices}; do
		losetup -d "${loop_dev}"
	done

	umount "${mount_dir}"
	rm -rf "${tmp_dir}"
}

# $1: temporary directory previously passed to "setup".
# Replaces this shell with the copy of the test binary on the test mount.
run()
{
	local tmp_dir="$1"
	local mount_dir="${tmp_dir}/mnt"
	local copied_bin_path="${mount_dir}/$(basename "${TEST_BINARY}")"

	exec "${copied_bin_path}"
}

main()
{
	[[ $# -ne 2 ]] && usage

	local action="$1"
	local tmp_dir="$2"

	[[ ! -d "${tmp_dir}" ]] && echo "Directory ${tmp_dir} doesn't exist" && exit 1

	case "${action}" in
	setup)
		setup "${tmp_dir}"
		;;
	cleanup)
		cleanup "${tmp_dir}"
		;;
	run)
		run "${tmp_dir}"
		;;
	*)
		echo "Unknown action: ${action}"
		exit 1
		;;
	esac
}

main "$@"

+69 -11

tools/testing/selftests/bpf/prog_tests/core_reloc.c

··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include <test_progs.h> 3 3 #include "progs/core_reloc_types.h" 4 + #include "bpf_testmod/bpf_testmod.h" 4 5 #include <sys/mman.h> 5 6 #include <sys/syscall.h> 6 7 #include <bpf/btf.h> ··· 9 8 static int duration = 0; 10 9 11 10 #define STRUCT_TO_CHAR_PTR(struct_name) (const char *)&(struct struct_name) 11 + 12 + #define MODULES_CASE(name, sec_name, tp_name) { \ 13 + .case_name = name, \ 14 + .bpf_obj_file = "test_core_reloc_module.o", \ 15 + .btf_src_file = NULL, /* find in kernel module BTFs */ \ 16 + .input = "", \ 17 + .input_len = 0, \ 18 + .output = STRUCT_TO_CHAR_PTR(core_reloc_module_output) { \ 19 + .read_ctx_sz = sizeof(struct bpf_testmod_test_read_ctx),\ 20 + .read_ctx_exists = true, \ 21 + .buf_exists = true, \ 22 + .len_exists = true, \ 23 + .off_exists = true, \ 24 + .len = 123, \ 25 + .off = 0, \ 26 + .comm = "test_progs", \ 27 + .comm_len = sizeof("test_progs"), \ 28 + }, \ 29 + .output_len = sizeof(struct core_reloc_module_output), \ 30 + .prog_sec_name = sec_name, \ 31 + .raw_tp_name = tp_name, \ 32 + .trigger = trigger_module_test_read, \ 33 + .needs_testmod = true, \ 34 + } 12 35 13 36 #define FLAVORS_DATA(struct_name) STRUCT_TO_CHAR_PTR(struct_name) { \ 14 37 .a = 42, \ ··· 236 211 .output = STRUCT_TO_CHAR_PTR(core_reloc_bitfields_output) \ 237 212 __VA_ARGS__, \ 238 213 .output_len = sizeof(struct core_reloc_bitfields_output), \ 239 - .direct_raw_tp = true, \ 214 + .prog_sec_name = "tp_btf/sys_enter", \ 240 215 } 241 216 242 217 ··· 247 222 }, { \ 248 223 BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_direct.o", \ 249 224 "direct:", name), \ 250 - .direct_raw_tp = true, \ 225 + .prog_sec_name = "tp_btf/sys_enter", \ 251 226 .fails = true, \ 252 227 } 253 228 ··· 334 309 struct core_reloc_test_case; 335 310 336 311 typedef int (*setup_test_fn)(struct core_reloc_test_case *test); 312 + typedef int (*trigger_test_fn)(const struct core_reloc_test_case *test); 337 313 338 314 struct 
core_reloc_test_case { 339 315 const char *case_name; ··· 345 319 const char *output; 346 320 int output_len; 347 321 bool fails; 322 + bool needs_testmod; 348 323 bool relaxed_core_relocs; 349 - bool direct_raw_tp; 324 + const char *prog_sec_name; 325 + const char *raw_tp_name; 350 326 setup_test_fn setup; 327 + trigger_test_fn trigger; 351 328 }; 352 329 353 330 static int find_btf_type(const struct btf *btf, const char *name, __u32 kind) ··· 480 451 return 0; 481 452 } 482 453 454 + static int trigger_module_test_read(const struct core_reloc_test_case *test) 455 + { 456 + struct core_reloc_module_output *exp = (void *)test->output; 457 + int fd, err; 458 + 459 + fd = open("/sys/kernel/bpf_testmod", O_RDONLY); 460 + err = -errno; 461 + if (CHECK(fd < 0, "testmod_file_open", "failed: %d\n", err)) 462 + return err; 463 + 464 + read(fd, NULL, exp->len); /* request expected number of bytes */ 465 + close(fd); 466 + 467 + return 0; 468 + } 469 + 470 + 483 471 static struct core_reloc_test_case test_cases[] = { 484 472 /* validate we can find kernel image and use its BTF for relocs */ 485 473 { ··· 512 466 }, 513 467 .output_len = sizeof(struct core_reloc_kernel_output), 514 468 }, 469 + 470 + /* validate we can find kernel module BTF types for relocs/attach */ 471 + MODULES_CASE("module_probed", "raw_tp/bpf_testmod_test_read", "bpf_testmod_test_read"), 472 + MODULES_CASE("module_direct", "tp_btf/bpf_testmod_test_read", NULL), 515 473 516 474 /* validate BPF program can use multiple flavors to match against 517 475 * single target BTF type ··· 829 779 if (!test__start_subtest(test_case->case_name)) 830 780 continue; 831 781 782 + if (test_case->needs_testmod && !env.has_testmod) { 783 + test__skip(); 784 + continue; 785 + } 786 + 832 787 if (test_case->setup) { 833 788 err = test_case->setup(test_case); 834 789 if (CHECK(err, "test_setup", "test #%d setup failed: %d\n", i, err)) ··· 845 790 test_case->bpf_obj_file, PTR_ERR(obj))) 846 791 continue; 847 792 848 - /* for 
typed raw tracepoints, NULL should be specified */ 849 - if (test_case->direct_raw_tp) { 850 - probe_name = "tp_btf/sys_enter"; 851 - tp_name = NULL; 852 - } else { 853 - probe_name = "raw_tracepoint/sys_enter"; 854 - tp_name = "sys_enter"; 793 + probe_name = "raw_tracepoint/sys_enter"; 794 + tp_name = "sys_enter"; 795 + if (test_case->prog_sec_name) { 796 + probe_name = test_case->prog_sec_name; 797 + tp_name = test_case->raw_tp_name; /* NULL for tp_btf */ 855 798 } 856 799 857 800 prog = bpf_object__find_program_by_title(obj, probe_name); ··· 890 837 goto cleanup; 891 838 892 839 /* trigger test run */ 893 - usleep(1); 840 + if (test_case->trigger) { 841 + if (!ASSERT_OK(test_case->trigger(test_case), "test_trigger")) 842 + goto cleanup; 843 + } else { 844 + usleep(1); 845 + } 894 846 895 847 if (data->skip) { 896 848 test__skip();

+53

tools/testing/selftests/bpf/prog_tests/module_attach.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + 4 + #include <test_progs.h> 5 + #include "test_module_attach.skel.h" 6 + 7 + static int duration; 8 + 9 + static int trigger_module_test_read(int read_sz) 10 + { 11 + int fd, err; 12 + 13 + fd = open("/sys/kernel/bpf_testmod", O_RDONLY); 14 + err = -errno; 15 + if (CHECK(fd < 0, "testmod_file_open", "failed: %d\n", err)) 16 + return err; 17 + 18 + read(fd, NULL, read_sz); 19 + close(fd); 20 + 21 + return 0; 22 + } 23 + 24 + void test_module_attach(void) 25 + { 26 + const int READ_SZ = 456; 27 + struct test_module_attach* skel; 28 + struct test_module_attach__bss *bss; 29 + int err; 30 + 31 + skel = test_module_attach__open_and_load(); 32 + if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) 33 + return; 34 + 35 + bss = skel->bss; 36 + 37 + err = test_module_attach__attach(skel); 38 + if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) 39 + goto cleanup; 40 + 41 + /* trigger tracepoint */ 42 + ASSERT_OK(trigger_module_test_read(READ_SZ), "trigger_read"); 43 + 44 + ASSERT_EQ(bss->raw_tp_read_sz, READ_SZ, "raw_tp"); 45 + ASSERT_EQ(bss->tp_btf_read_sz, READ_SZ, "tp_btf"); 46 + ASSERT_EQ(bss->fentry_read_sz, READ_SZ, "fentry"); 47 + ASSERT_EQ(bss->fexit_read_sz, READ_SZ, "fexit"); 48 + ASSERT_EQ(bss->fexit_ret, -EIO, "fexit_tet"); 49 + ASSERT_EQ(bss->fmod_ret_read_sz, READ_SZ, "fmod_ret"); 50 + 51 + cleanup: 52 + test_module_attach__destroy(skel); 53 + }

+4

tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c

··· 42 42 43 43 /* check getsockopt for SAVED_SYN */ 44 44 ASSERT_EQ(result->tcp_saved_syn, 1, "tcp_saved_syn"); 45 + 46 + /* check getsockopt for window_clamp */ 47 + ASSERT_EQ(result->window_clamp_client, 9216, "window_clamp_client"); 48 + ASSERT_EQ(result->window_clamp_server, 9216, "window_clamp_server"); 45 49 } 46 50 47 51 static void run_test(struct tcpbpf_globals *result)

+116

tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + /* 4 + * Copyright (C) 2020 Google LLC. 5 + */ 6 + 7 + #include <test_progs.h> 8 + #include <linux/limits.h> 9 + 10 + #include "bprm_opts.skel.h" 11 + #include "network_helpers.h" 12 + 13 + #ifndef __NR_pidfd_open 14 + #define __NR_pidfd_open 434 15 + #endif 16 + 17 + static const char * const bash_envp[] = { "TMPDIR=shouldnotbeset", NULL }; 18 + 19 + static inline int sys_pidfd_open(pid_t pid, unsigned int flags) 20 + { 21 + return syscall(__NR_pidfd_open, pid, flags); 22 + } 23 + 24 + static int update_storage(int map_fd, int secureexec) 25 + { 26 + int task_fd, ret = 0; 27 + 28 + task_fd = sys_pidfd_open(getpid(), 0); 29 + if (task_fd < 0) 30 + return errno; 31 + 32 + ret = bpf_map_update_elem(map_fd, &task_fd, &secureexec, BPF_NOEXIST); 33 + if (ret) 34 + ret = errno; 35 + 36 + close(task_fd); 37 + return ret; 38 + } 39 + 40 + static int run_set_secureexec(int map_fd, int secureexec) 41 + { 42 + int child_pid, child_status, ret, null_fd; 43 + 44 + child_pid = fork(); 45 + if (child_pid == 0) { 46 + null_fd = open("/dev/null", O_WRONLY); 47 + if (null_fd == -1) 48 + exit(errno); 49 + dup2(null_fd, STDOUT_FILENO); 50 + dup2(null_fd, STDERR_FILENO); 51 + close(null_fd); 52 + 53 + /* Ensure that all executions from hereon are 54 + * secure by setting a local storage which is read by 55 + * the bprm_creds_for_exec hook and sets bprm->secureexec. 56 + */ 57 + ret = update_storage(map_fd, secureexec); 58 + if (ret) 59 + exit(ret); 60 + 61 + /* If the binary is executed with securexec=1, the dynamic 62 + * loader ingores and unsets certain variables like LD_PRELOAD, 63 + * TMPDIR etc. TMPDIR is used here to simplify the example, as 64 + * LD_PRELOAD requires a real .so file. 65 + * 66 + * If the value of TMPDIR is set, the bash command returns 10 67 + * and if the value is unset, it returns 20. 
68 + */ 69 + execle("/bin/bash", "bash", "-c", 70 + "[[ -z \"${TMPDIR}\" ]] || exit 10 && exit 20", NULL, 71 + bash_envp); 72 + exit(errno); 73 + } else if (child_pid > 0) { 74 + waitpid(child_pid, &child_status, 0); 75 + ret = WEXITSTATUS(child_status); 76 + 77 + /* If a secureexec occurred, the exit status should be 20 */ 78 + if (secureexec && ret == 20) 79 + return 0; 80 + 81 + /* If normal execution happened, the exit code should be 10 */ 82 + if (!secureexec && ret == 10) 83 + return 0; 84 + } 85 + 86 + return -EINVAL; 87 + } 88 + 89 + void test_test_bprm_opts(void) 90 + { 91 + int err, duration = 0; 92 + struct bprm_opts *skel = NULL; 93 + 94 + skel = bprm_opts__open_and_load(); 95 + if (CHECK(!skel, "skel_load", "skeleton failed\n")) 96 + goto close_prog; 97 + 98 + err = bprm_opts__attach(skel); 99 + if (CHECK(err, "attach", "attach failed: %d\n", err)) 100 + goto close_prog; 101 + 102 + /* Run the test with the secureexec bit unset */ 103 + err = run_set_secureexec(bpf_map__fd(skel->maps.secure_exec_task_map), 104 + 0 /* secureexec */); 105 + if (CHECK(err, "run_set_secureexec:0", "err = %d\n", err)) 106 + goto close_prog; 107 + 108 + /* Run the test with the secureexec bit set */ 109 + err = run_set_secureexec(bpf_map__fd(skel->maps.secure_exec_task_map), 110 + 1 /* secureexec */); 111 + if (CHECK(err, "run_set_secureexec:1", "err = %d\n", err)) 112 + goto close_prog; 113 + 114 + close_prog: 115 + bprm_opts__destroy(skel); 116 + }

+74

tools/testing/selftests/bpf/prog_tests/test_ima.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + /* 4 + * Copyright (C) 2020 Google LLC. 5 + */ 6 + 7 + #include <stdio.h> 8 + #include <stdlib.h> 9 + #include <unistd.h> 10 + #include <sys/wait.h> 11 + #include <test_progs.h> 12 + 13 + #include "ima.skel.h" 14 + 15 + static int run_measured_process(const char *measured_dir, u32 *monitored_pid) 16 + { 17 + int child_pid, child_status; 18 + 19 + child_pid = fork(); 20 + if (child_pid == 0) { 21 + *monitored_pid = getpid(); 22 + execlp("./ima_setup.sh", "./ima_setup.sh", "run", measured_dir, 23 + NULL); 24 + exit(errno); 25 + 26 + } else if (child_pid > 0) { 27 + waitpid(child_pid, &child_status, 0); 28 + return WEXITSTATUS(child_status); 29 + } 30 + 31 + return -EINVAL; 32 + } 33 + 34 + void test_test_ima(void) 35 + { 36 + char measured_dir_template[] = "/tmp/ima_measuredXXXXXX"; 37 + const char *measured_dir; 38 + char cmd[256]; 39 + 40 + int err, duration = 0; 41 + struct ima *skel = NULL; 42 + 43 + skel = ima__open_and_load(); 44 + if (CHECK(!skel, "skel_load", "skeleton failed\n")) 45 + goto close_prog; 46 + 47 + err = ima__attach(skel); 48 + if (CHECK(err, "attach", "attach failed: %d\n", err)) 49 + goto close_prog; 50 + 51 + measured_dir = mkdtemp(measured_dir_template); 52 + if (CHECK(measured_dir == NULL, "mkdtemp", "err %d\n", errno)) 53 + goto close_prog; 54 + 55 + snprintf(cmd, sizeof(cmd), "./ima_setup.sh setup %s", measured_dir); 56 + if (CHECK_FAIL(system(cmd))) 57 + goto close_clean; 58 + 59 + err = run_measured_process(measured_dir, &skel->bss->monitored_pid); 60 + if (CHECK(err, "run_measured_process", "err = %d\n", err)) 61 + goto close_clean; 62 + 63 + CHECK(skel->data->ima_hash_ret < 0, "ima_hash_ret", 64 + "ima_hash_ret = %ld\n", skel->data->ima_hash_ret); 65 + 66 + CHECK(skel->bss->ima_hash == 0, "ima_hash", 67 + "ima_hash = %lu\n", skel->bss->ima_hash); 68 + 69 + close_clean: 70 + snprintf(cmd, sizeof(cmd), "./ima_setup.sh cleanup %s", measured_dir); 71 + CHECK_FAIL(system(cmd)); 72 + 
close_prog: 73 + ima__destroy(skel); 74 + }

+18 -10

tools/testing/selftests/bpf/prog_tests/test_local_storage.c

··· 21 21 return syscall(__NR_pidfd_open, pid, flags); 22 22 } 23 23 24 - static inline ssize_t copy_file_range(int fd_in, loff_t *off_in, int fd_out, 25 - loff_t *off_out, size_t len, 26 - unsigned int flags) 27 - { 28 - return syscall(__NR_copy_file_range, fd_in, off_in, fd_out, off_out, 29 - len, flags); 30 - } 31 - 32 24 static unsigned int duration; 33 25 34 26 #define TEST_STORAGE_VALUE 0xbeefdead ··· 39 47 { 40 48 int fd_in, fd_out = -1, ret = 0; 41 49 struct stat stat; 50 + char *buf = NULL; 42 51 43 52 fd_in = open("/bin/rm", O_RDONLY); 44 53 if (fd_in < 0) ··· 57 64 goto out; 58 65 } 59 66 60 - ret = copy_file_range(fd_in, NULL, fd_out, NULL, stat.st_size, 0); 61 - if (ret == -1) { 67 + buf = malloc(stat.st_blksize); 68 + if (!buf) { 62 69 ret = -errno; 63 70 goto out; 71 + } 72 + 73 + while (ret = read(fd_in, buf, stat.st_blksize), ret > 0) { 74 + ret = write(fd_out, buf, ret); 75 + if (ret < 0) { 76 + ret = -errno; 77 + goto out; 78 + 79 + } 80 + } 81 + if (ret < 0) { 82 + ret = -errno; 83 + goto out; 84 + 64 85 } 65 86 66 87 /* Set executable permission on the copied file */ ··· 83 76 ret = -errno; 84 77 85 78 out: 79 + free(buf); 86 80 close(fd_in); 87 81 close(fd_out); 88 82 return ret;

+102

tools/testing/selftests/bpf/progs/bind4_prog.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include <string.h> 4 + 5 + #include <linux/stddef.h> 6 + #include <linux/bpf.h> 7 + #include <linux/in.h> 8 + #include <linux/in6.h> 9 + #include <sys/socket.h> 10 + #include <netinet/tcp.h> 11 + #include <linux/if.h> 12 + #include <errno.h> 13 + 14 + #include <bpf/bpf_helpers.h> 15 + #include <bpf/bpf_endian.h> 16 + 17 + #define SERV4_IP 0xc0a801feU /* 192.168.1.254 */ 18 + #define SERV4_PORT 4040 19 + #define SERV4_REWRITE_IP 0x7f000001U /* 127.0.0.1 */ 20 + #define SERV4_REWRITE_PORT 4444 21 + 22 + #ifndef IFNAMSIZ 23 + #define IFNAMSIZ 16 24 + #endif 25 + 26 + static __inline int bind_to_device(struct bpf_sock_addr *ctx) 27 + { 28 + char veth1[IFNAMSIZ] = "test_sock_addr1"; 29 + char veth2[IFNAMSIZ] = "test_sock_addr2"; 30 + char missing[IFNAMSIZ] = "nonexistent_dev"; 31 + char del_bind[IFNAMSIZ] = ""; 32 + 33 + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, 34 + &veth1, sizeof(veth1))) 35 + return 1; 36 + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, 37 + &veth2, sizeof(veth2))) 38 + return 1; 39 + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, 40 + &missing, sizeof(missing)) != -ENODEV) 41 + return 1; 42 + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, 43 + &del_bind, sizeof(del_bind))) 44 + return 1; 45 + 46 + return 0; 47 + } 48 + 49 + SEC("cgroup/bind4") 50 + int bind_v4_prog(struct bpf_sock_addr *ctx) 51 + { 52 + struct bpf_sock *sk; 53 + __u32 user_ip4; 54 + __u16 user_port; 55 + 56 + sk = ctx->sk; 57 + if (!sk) 58 + return 0; 59 + 60 + if (sk->family != AF_INET) 61 + return 0; 62 + 63 + if (ctx->type != SOCK_STREAM && ctx->type != SOCK_DGRAM) 64 + return 0; 65 + 66 + if (ctx->user_ip4 != bpf_htonl(SERV4_IP) || 67 + ctx->user_port != bpf_htons(SERV4_PORT)) 68 + return 0; 69 + 70 + // u8 narrow loads: 71 + user_ip4 = 0; 72 + user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[0] << 0; 73 + user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[1] << 8; 74 + user_ip4 |= ((volatile __u8 
*)&ctx->user_ip4)[2] << 16; 75 + user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[3] << 24; 76 + if (ctx->user_ip4 != user_ip4) 77 + return 0; 78 + 79 + user_port = 0; 80 + user_port |= ((volatile __u8 *)&ctx->user_port)[0] << 0; 81 + user_port |= ((volatile __u8 *)&ctx->user_port)[1] << 8; 82 + if (ctx->user_port != user_port) 83 + return 0; 84 + 85 + // u16 narrow loads: 86 + user_ip4 = 0; 87 + user_ip4 |= ((volatile __u16 *)&ctx->user_ip4)[0] << 0; 88 + user_ip4 |= ((volatile __u16 *)&ctx->user_ip4)[1] << 16; 89 + if (ctx->user_ip4 != user_ip4) 90 + return 0; 91 + 92 + /* Bind to device and unbind it. */ 93 + if (bind_to_device(ctx)) 94 + return 0; 95 + 96 + ctx->user_ip4 = bpf_htonl(SERV4_REWRITE_IP); 97 + ctx->user_port = bpf_htons(SERV4_REWRITE_PORT); 98 + 99 + return 1; 100 + } 101 + 102 + char _license[] SEC("license") = "GPL";

+119

tools/testing/selftests/bpf/progs/bind6_prog.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include <string.h> 4 + 5 + #include <linux/stddef.h> 6 + #include <linux/bpf.h> 7 + #include <linux/in.h> 8 + #include <linux/in6.h> 9 + #include <sys/socket.h> 10 + #include <netinet/tcp.h> 11 + #include <linux/if.h> 12 + #include <errno.h> 13 + 14 + #include <bpf/bpf_helpers.h> 15 + #include <bpf/bpf_endian.h> 16 + 17 + #define SERV6_IP_0 0xfaceb00c /* face:b00c:1234:5678::abcd */ 18 + #define SERV6_IP_1 0x12345678 19 + #define SERV6_IP_2 0x00000000 20 + #define SERV6_IP_3 0x0000abcd 21 + #define SERV6_PORT 6060 22 + #define SERV6_REWRITE_IP_0 0x00000000 23 + #define SERV6_REWRITE_IP_1 0x00000000 24 + #define SERV6_REWRITE_IP_2 0x00000000 25 + #define SERV6_REWRITE_IP_3 0x00000001 26 + #define SERV6_REWRITE_PORT 6666 27 + 28 + #ifndef IFNAMSIZ 29 + #define IFNAMSIZ 16 30 + #endif 31 + 32 + static __inline int bind_to_device(struct bpf_sock_addr *ctx) 33 + { 34 + char veth1[IFNAMSIZ] = "test_sock_addr1"; 35 + char veth2[IFNAMSIZ] = "test_sock_addr2"; 36 + char missing[IFNAMSIZ] = "nonexistent_dev"; 37 + char del_bind[IFNAMSIZ] = ""; 38 + 39 + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, 40 + &veth1, sizeof(veth1))) 41 + return 1; 42 + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, 43 + &veth2, sizeof(veth2))) 44 + return 1; 45 + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, 46 + &missing, sizeof(missing)) != -ENODEV) 47 + return 1; 48 + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, 49 + &del_bind, sizeof(del_bind))) 50 + return 1; 51 + 52 + return 0; 53 + } 54 + 55 + SEC("cgroup/bind6") 56 + int bind_v6_prog(struct bpf_sock_addr *ctx) 57 + { 58 + struct bpf_sock *sk; 59 + __u32 user_ip6; 60 + __u16 user_port; 61 + int i; 62 + 63 + sk = ctx->sk; 64 + if (!sk) 65 + return 0; 66 + 67 + if (sk->family != AF_INET6) 68 + return 0; 69 + 70 + if (ctx->type != SOCK_STREAM && ctx->type != SOCK_DGRAM) 71 + return 0; 72 + 73 + if (ctx->user_ip6[0] != bpf_htonl(SERV6_IP_0) || 74 + ctx->user_ip6[1] 
!= bpf_htonl(SERV6_IP_1) || 75 + ctx->user_ip6[2] != bpf_htonl(SERV6_IP_2) || 76 + ctx->user_ip6[3] != bpf_htonl(SERV6_IP_3) || 77 + ctx->user_port != bpf_htons(SERV6_PORT)) 78 + return 0; 79 + 80 + // u8 narrow loads: 81 + for (i = 0; i < 4; i++) { 82 + user_ip6 = 0; 83 + user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[0] << 0; 84 + user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[1] << 8; 85 + user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[2] << 16; 86 + user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[3] << 24; 87 + if (ctx->user_ip6[i] != user_ip6) 88 + return 0; 89 + } 90 + 91 + user_port = 0; 92 + user_port |= ((volatile __u8 *)&ctx->user_port)[0] << 0; 93 + user_port |= ((volatile __u8 *)&ctx->user_port)[1] << 8; 94 + if (ctx->user_port != user_port) 95 + return 0; 96 + 97 + // u16 narrow loads: 98 + for (i = 0; i < 4; i++) { 99 + user_ip6 = 0; 100 + user_ip6 |= ((volatile __u16 *)&ctx->user_ip6[i])[0] << 0; 101 + user_ip6 |= ((volatile __u16 *)&ctx->user_ip6[i])[1] << 16; 102 + if (ctx->user_ip6[i] != user_ip6) 103 + return 0; 104 + } 105 + 106 + /* Bind to device and unbind it. */ 107 + if (bind_to_device(ctx)) 108 + return 0; 109 + 110 + ctx->user_ip6[0] = bpf_htonl(SERV6_REWRITE_IP_0); 111 + ctx->user_ip6[1] = bpf_htonl(SERV6_REWRITE_IP_1); 112 + ctx->user_ip6[2] = bpf_htonl(SERV6_REWRITE_IP_2); 113 + ctx->user_ip6[3] = bpf_htonl(SERV6_REWRITE_IP_3); 114 + ctx->user_port = bpf_htons(SERV6_REWRITE_PORT); 115 + 116 + return 1; 117 + } 118 + 119 + char _license[] SEC("license") = "GPL";

+2

tools/testing/selftests/bpf/progs/bpf_flow.c

··· 368 368 */ 369 369 if (!(keys->flags & BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) 370 370 return export_flow_keys(keys, BPF_OK); 371 + } else { 372 + return export_flow_keys(keys, BPF_OK); 371 373 } 372 374 373 375 return parse_ipv6_proto(skb, fragh->nexthdr);

+1 -1

tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c

··· 23 23 24 24 BPF_SEQ_PRINTF(seq, "%8u %8ld %8ld %10lu\n", map->id, map->refcnt.counter, 25 25 map->usercnt.counter, 26 - map->memory.user->locked_vm.counter); 26 + 0LLU); 27 27 return 0; 28 28 }

+34

tools/testing/selftests/bpf/progs/bprm_opts.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + /* 4 + * Copyright 2020 Google LLC. 5 + */ 6 + 7 + #include "vmlinux.h" 8 + #include <errno.h> 9 + #include <bpf/bpf_helpers.h> 10 + #include <bpf/bpf_tracing.h> 11 + 12 + char _license[] SEC("license") = "GPL"; 13 + 14 + struct { 15 + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); 16 + __uint(map_flags, BPF_F_NO_PREALLOC); 17 + __type(key, int); 18 + __type(value, int); 19 + } secure_exec_task_map SEC(".maps"); 20 + 21 + SEC("lsm/bprm_creds_for_exec") 22 + int BPF_PROG(secure_exec, struct linux_binprm *bprm) 23 + { 24 + int *secureexec; 25 + 26 + secureexec = bpf_task_storage_get(&secure_exec_task_map, 27 + bpf_get_current_task_btf(), 0, 28 + BPF_LOCAL_STORAGE_GET_F_CREATE); 29 + 30 + if (secureexec && *secureexec) 31 + bpf_bprm_opts_set(bprm, BPF_F_BPRM_SECUREEXEC); 32 + 33 + return 0; 34 + }

+17

tools/testing/selftests/bpf/progs/core_reloc_types.h

··· 16 16 }; 17 17 18 18 /* 19 + * MODULE 20 + */ 21 + 22 + struct core_reloc_module_output { 23 + long long len; 24 + long long off; 25 + int read_ctx_sz; 26 + bool read_ctx_exists; 27 + bool buf_exists; 28 + bool len_exists; 29 + bool off_exists; 30 + /* we have test_progs[-flavor], so cut flavor part */ 31 + char comm[sizeof("test_progs")]; 32 + int comm_len; 33 + }; 34 + 35 + /* 19 36 * FLAVORS 20 37 */ 21 38 struct core_reloc_flavors {

+28

tools/testing/selftests/bpf/progs/ima.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + /* 4 + * Copyright 2020 Google LLC. 5 + */ 6 + 7 + #include "vmlinux.h" 8 + #include <errno.h> 9 + #include <bpf/bpf_helpers.h> 10 + #include <bpf/bpf_tracing.h> 11 + 12 + long ima_hash_ret = -1; 13 + u64 ima_hash = 0; 14 + u32 monitored_pid = 0; 15 + 16 + char _license[] SEC("license") = "GPL"; 17 + 18 + SEC("lsm.s/bprm_committed_creds") 19 + int BPF_PROG(ima, struct linux_binprm *bprm) 20 + { 21 + u32 pid = bpf_get_current_pid_tgid() >> 32; 22 + 23 + if (pid == monitored_pid) 24 + ima_hash_ret = bpf_ima_inode_hash(bprm->file->f_inode, 25 + &ima_hash, sizeof(ima_hash)); 26 + 27 + return 0; 28 + }

-7

tools/testing/selftests/bpf/progs/map_ptr_kern.c

··· 26 26 return 0; \ 27 27 }) 28 28 29 - struct bpf_map_memory { 30 - __u32 pages; 31 - } __attribute__((preserve_access_index)); 32 - 33 29 struct bpf_map { 34 30 enum bpf_map_type map_type; 35 31 __u32 key_size; 36 32 __u32 value_size; 37 33 __u32 max_entries; 38 34 __u32 id; 39 - struct bpf_map_memory memory; 40 35 } __attribute__((preserve_access_index)); 41 36 42 37 static inline int check_bpf_map_fields(struct bpf_map *map, __u32 key_size, ··· 42 47 VERIFY(map->value_size == value_size); 43 48 VERIFY(map->max_entries == max_entries); 44 49 VERIFY(map->id > 0); 45 - VERIFY(map->memory.pages > 0); 46 50 47 51 return 1; 48 52 } ··· 54 60 VERIFY(indirect->value_size == direct->value_size); 55 61 VERIFY(indirect->max_entries == direct->max_entries); 56 62 VERIFY(indirect->id == direct->id); 57 - VERIFY(indirect->memory.pages == direct->memory.pages); 58 63 59 64 return 1; 60 65 }

+2

tools/testing/selftests/bpf/progs/profiler.inc.h

··· 256 256 BPF_CORE_READ(task, nsproxy, cgroup_ns, root_cset, dfl_cgrp, kn); 257 257 struct kernfs_node* proc_kernfs = BPF_CORE_READ(task, cgroups, dfl_cgrp, kn); 258 258 259 + #if __has_builtin(__builtin_preserve_enum_value) 259 260 if (ENABLE_CGROUP_V1_RESOLVER && CONFIG_CGROUP_PIDS) { 260 261 int cgrp_id = bpf_core_enum_value(enum cgroup_subsys_id___local, 261 262 pids_cgrp_id___local); ··· 276 275 } 277 276 } 278 277 } 278 + #endif 279 279 280 280 cgroup_data->cgroup_root_inode = get_inode_from_kernfs(root_kernfs); 281 281 cgroup_data->cgroup_proc_inode = get_inode_from_kernfs(proc_kernfs);

+96

tools/testing/selftests/bpf/progs/test_core_reloc_module.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + 4 + #include "vmlinux.h" 5 + #include <bpf/bpf_helpers.h> 6 + #include <bpf/bpf_core_read.h> 7 + #include <bpf/bpf_tracing.h> 8 + 9 + char _license[] SEC("license") = "GPL"; 10 + 11 + struct bpf_testmod_test_read_ctx { 12 + /* field order is mixed up */ 13 + size_t len; 14 + char *buf; 15 + loff_t off; 16 + } __attribute__((preserve_access_index)); 17 + 18 + struct { 19 + char in[256]; 20 + char out[256]; 21 + bool skip; 22 + uint64_t my_pid_tgid; 23 + } data = {}; 24 + 25 + struct core_reloc_module_output { 26 + long long len; 27 + long long off; 28 + int read_ctx_sz; 29 + bool read_ctx_exists; 30 + bool buf_exists; 31 + bool len_exists; 32 + bool off_exists; 33 + /* we have test_progs[-flavor], so cut flavor part */ 34 + char comm[sizeof("test_progs")]; 35 + int comm_len; 36 + }; 37 + 38 + SEC("raw_tp/bpf_testmod_test_read") 39 + int BPF_PROG(test_core_module_probed, 40 + struct task_struct *task, 41 + struct bpf_testmod_test_read_ctx *read_ctx) 42 + { 43 + struct core_reloc_module_output *out = (void *)&data.out; 44 + __u64 pid_tgid = bpf_get_current_pid_tgid(); 45 + __u32 real_tgid = (__u32)(pid_tgid >> 32); 46 + __u32 real_pid = (__u32)pid_tgid; 47 + 48 + if (data.my_pid_tgid != pid_tgid) 49 + return 0; 50 + 51 + if (BPF_CORE_READ(task, pid) != real_pid || BPF_CORE_READ(task, tgid) != real_tgid) 52 + return 0; 53 + 54 + out->len = BPF_CORE_READ(read_ctx, len); 55 + out->off = BPF_CORE_READ(read_ctx, off); 56 + 57 + out->read_ctx_sz = bpf_core_type_size(struct bpf_testmod_test_read_ctx); 58 + out->read_ctx_exists = bpf_core_type_exists(struct bpf_testmod_test_read_ctx); 59 + out->buf_exists = bpf_core_field_exists(read_ctx->buf); 60 + out->off_exists = bpf_core_field_exists(read_ctx->off); 61 + out->len_exists = bpf_core_field_exists(read_ctx->len); 62 + 63 + out->comm_len = BPF_CORE_READ_STR_INTO(&out->comm, task, comm); 64 + 65 + return 0; 66 + } 67 + 68 + 
SEC("tp_btf/bpf_testmod_test_read") 69 + int BPF_PROG(test_core_module_direct, 70 + struct task_struct *task, 71 + struct bpf_testmod_test_read_ctx *read_ctx) 72 + { 73 + struct core_reloc_module_output *out = (void *)&data.out; 74 + __u64 pid_tgid = bpf_get_current_pid_tgid(); 75 + __u32 real_tgid = (__u32)(pid_tgid >> 32); 76 + __u32 real_pid = (__u32)pid_tgid; 77 + 78 + if (data.my_pid_tgid != pid_tgid) 79 + return 0; 80 + 81 + if (task->pid != real_pid || task->tgid != real_tgid) 82 + return 0; 83 + 84 + out->len = read_ctx->len; 85 + out->off = read_ctx->off; 86 + 87 + out->read_ctx_sz = bpf_core_type_size(struct bpf_testmod_test_read_ctx); 88 + out->read_ctx_exists = bpf_core_type_exists(struct bpf_testmod_test_read_ctx); 89 + out->buf_exists = bpf_core_field_exists(read_ctx->buf); 90 + out->off_exists = bpf_core_field_exists(read_ctx->off); 91 + out->len_exists = bpf_core_field_exists(read_ctx->len); 92 + 93 + out->comm_len = BPF_CORE_READ_STR_INTO(&out->comm, task, comm); 94 + 95 + return 0; 96 + }

+66

tools/testing/selftests/bpf/progs/test_module_attach.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + 4 + #include "vmlinux.h" 5 + #include <bpf/bpf_helpers.h> 6 + #include <bpf/bpf_tracing.h> 7 + #include <bpf/bpf_core_read.h> 8 + #include "../bpf_testmod/bpf_testmod.h" 9 + 10 + __u32 raw_tp_read_sz = 0; 11 + 12 + SEC("raw_tp/bpf_testmod_test_read") 13 + int BPF_PROG(handle_raw_tp, 14 + struct task_struct *task, struct bpf_testmod_test_read_ctx *read_ctx) 15 + { 16 + raw_tp_read_sz = BPF_CORE_READ(read_ctx, len); 17 + return 0; 18 + } 19 + 20 + __u32 tp_btf_read_sz = 0; 21 + 22 + SEC("tp_btf/bpf_testmod_test_read") 23 + int BPF_PROG(handle_tp_btf, 24 + struct task_struct *task, struct bpf_testmod_test_read_ctx *read_ctx) 25 + { 26 + tp_btf_read_sz = read_ctx->len; 27 + return 0; 28 + } 29 + 30 + __u32 fentry_read_sz = 0; 31 + 32 + SEC("fentry/bpf_testmod_test_read") 33 + int BPF_PROG(handle_fentry, 34 + struct file *file, struct kobject *kobj, 35 + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) 36 + { 37 + fentry_read_sz = len; 38 + return 0; 39 + } 40 + 41 + __u32 fexit_read_sz = 0; 42 + int fexit_ret = 0; 43 + 44 + SEC("fexit/bpf_testmod_test_read") 45 + int BPF_PROG(handle_fexit, 46 + struct file *file, struct kobject *kobj, 47 + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len, 48 + int ret) 49 + { 50 + fexit_read_sz = len; 51 + fexit_ret = ret; 52 + return 0; 53 + } 54 + 55 + __u32 fmod_ret_read_sz = 0; 56 + 57 + SEC("fmod_ret/bpf_testmod_test_read") 58 + int BPF_PROG(handle_fmod_ret, 59 + struct file *file, struct kobject *kobj, 60 + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) 61 + { 62 + fmod_ret_read_sz = len; 63 + return 0; /* don't override the exit code */ 64 + } 65 + 66 + char _license[] SEC("license") = "GPL";

+33

tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c

··· 12 12 #include <linux/tcp.h> 13 13 #include <bpf/bpf_helpers.h> 14 14 #include <bpf/bpf_endian.h> 15 + #include "bpf_tcp_helpers.h" 15 16 #include "test_tcpbpf.h" 16 17 17 18 struct tcpbpf_globals global = {}; 18 19 int _version SEC("version") = 1; 20 + 21 + /** 22 + * SOL_TCP is defined in <netinet/tcp.h> while 23 + * TCP_SAVED_SYN is defined in already included <linux/tcp.h> 24 + */ 25 + #ifndef SOL_TCP 26 + #define SOL_TCP 6 27 + #endif 28 + 29 + static __always_inline int get_tp_window_clamp(struct bpf_sock_ops *skops) 30 + { 31 + struct bpf_sock *sk; 32 + struct tcp_sock *tp; 33 + 34 + sk = skops->sk; 35 + if (!sk) 36 + return -1; 37 + tp = bpf_skc_to_tcp_sock(sk); 38 + if (!tp) 39 + return -1; 40 + return tp->window_clamp; 41 + } 19 42 20 43 SEC("sockops") 21 44 int bpf_testcb(struct bpf_sock_ops *skops) ··· 46 23 char header[sizeof(struct ipv6hdr) + sizeof(struct tcphdr)]; 47 24 struct bpf_sock_ops *reuse = skops; 48 25 struct tcphdr *thdr; 26 + int window_clamp = 9216; 49 27 int good_call_rv = 0; 50 28 int bad_call_rv = 0; 51 29 int save_syn = 1; ··· 99 75 global.event_map |= (1 << op); 100 76 101 77 switch (op) { 78 + case BPF_SOCK_OPS_TCP_CONNECT_CB: 79 + rv = bpf_setsockopt(skops, SOL_TCP, TCP_WINDOW_CLAMP, 80 + &window_clamp, sizeof(window_clamp)); 81 + global.window_clamp_client = get_tp_window_clamp(skops); 82 + break; 102 83 case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: 103 84 /* Test failure to set largest cb flag (assumes not defined) */ 104 85 global.bad_cb_test_rv = bpf_sock_ops_cb_flags_set(skops, 0x80); ··· 129 100 global.tcp_saved_syn = v; 130 101 } 131 102 } 103 + rv = bpf_setsockopt(skops, SOL_TCP, TCP_WINDOW_CLAMP, 104 + &window_clamp, sizeof(window_clamp)); 105 + 106 + global.window_clamp_server = get_tp_window_clamp(skops); 132 107 break; 133 108 case BPF_SOCK_OPS_RTO_CB: 134 109 break;

+62 -3

tools/testing/selftests/bpf/test_progs.c

··· 149 149 150 150 if (sub_error_cnt) 151 151 env.fail_cnt++; 152 - else 152 + else if (test->skip_cnt == 0) 153 153 env.sub_succ_cnt++; 154 154 skip_account(); 155 155 156 156 dump_test_log(test, sub_error_cnt); 157 157 158 158 fprintf(env.stdout, "#%d/%d %s:%s\n", 159 - test->test_num, test->subtest_num, 160 - test->subtest_name, sub_error_cnt ? "FAIL" : "OK"); 159 + test->test_num, test->subtest_num, test->subtest_name, 160 + sub_error_cnt ? "FAIL" : (test->skip_cnt ? "SKIP" : "OK")); 161 161 162 162 free(test->subtest_name); 163 163 test->subtest_name = NULL; ··· 358 358 err: 359 359 fclose(fp); 360 360 return -1; 361 + } 362 + 363 + static int finit_module(int fd, const char *param_values, int flags) 364 + { 365 + return syscall(__NR_finit_module, fd, param_values, flags); 366 + } 367 + 368 + static int delete_module(const char *name, int flags) 369 + { 370 + return syscall(__NR_delete_module, name, flags); 371 + } 372 + 373 + static void unload_bpf_testmod(void) 374 + { 375 + if (delete_module("bpf_testmod", 0)) { 376 + if (errno == ENOENT) { 377 + if (env.verbosity > VERBOSE_NONE) 378 + fprintf(stdout, "bpf_testmod.ko is already unloaded.\n"); 379 + return; 380 + } 381 + fprintf(env.stderr, "Failed to unload bpf_testmod.ko from kernel: %d\n", -errno); 382 + exit(1); 383 + } 384 + if (env.verbosity > VERBOSE_NONE) 385 + fprintf(stdout, "Successfully unloaded bpf_testmod.ko.\n"); 386 + } 387 + 388 + static int load_bpf_testmod(void) 389 + { 390 + int fd; 391 + 392 + /* ensure previous instance of the module is unloaded */ 393 + unload_bpf_testmod(); 394 + 395 + if (env.verbosity > VERBOSE_NONE) 396 + fprintf(stdout, "Loading bpf_testmod.ko...\n"); 397 + 398 + fd = open("bpf_testmod.ko", O_RDONLY); 399 + if (fd < 0) { 400 + fprintf(env.stderr, "Can't find bpf_testmod.ko kernel module: %d\n", -errno); 401 + return -ENOENT; 402 + } 403 + if (finit_module(fd, "", 0)) { 404 + fprintf(env.stderr, "Failed to load bpf_testmod.ko into the kernel: %d\n", -errno); 405 + 
close(fd); 406 + return -EINVAL; 407 + } 408 + close(fd); 409 + 410 + if (env.verbosity > VERBOSE_NONE) 411 + fprintf(stdout, "Successfully loaded bpf_testmod.ko.\n"); 412 + return 0; 361 413 } 362 414 363 415 /* extern declarations for test funcs */ ··· 730 678 731 679 save_netns(); 732 680 stdio_hijack(); 681 + env.has_testmod = true; 682 + if (load_bpf_testmod()) { 683 + fprintf(env.stderr, "WARNING! Selftests relying on bpf_testmod.ko will be skipped.\n"); 684 + env.has_testmod = false; 685 + } 733 686 for (i = 0; i < prog_test_cnt; i++) { 734 687 struct prog_test_def *test = &prog_test_defs[i]; 735 688 ··· 779 722 if (test->need_cgroup_cleanup) 780 723 cleanup_cgroup_environment(); 781 724 } 725 + if (env.has_testmod) 726 + unload_bpf_testmod(); 782 727 stdio_restore(); 783 728 784 729 if (env.get_test_cnt) {

+1

tools/testing/selftests/bpf/test_progs.h

··· 66 66 enum verbosity verbosity; 67 67 68 68 bool jit_enabled; 69 + bool has_testmod; 69 70 bool get_test_cnt; 70 71 bool list_test_names; 71 72

+12 -184

tools/testing/selftests/bpf/test_sock_addr.c

··· 31 31 #define CONNECT6_PROG_PATH "./connect6_prog.o" 32 32 #define SENDMSG4_PROG_PATH "./sendmsg4_prog.o" 33 33 #define SENDMSG6_PROG_PATH "./sendmsg6_prog.o" 34 + #define BIND4_PROG_PATH "./bind4_prog.o" 35 + #define BIND6_PROG_PATH "./bind6_prog.o" 34 36 35 37 #define SERV4_IP "192.168.1.254" 36 38 #define SERV4_REWRITE_IP "127.0.0.1" ··· 662 660 return ret; 663 661 } 664 662 665 - /* [1] These testing programs try to read different context fields, including 666 - * narrow loads of different sizes from user_ip4 and user_ip6, and write to 667 - * those allowed to be overridden. 668 - * 669 - * [2] BPF_LD_IMM64 & BPF_JMP_REG are used below whenever there is a need to 670 - * compare a register with unsigned 32bit integer. BPF_JMP_IMM can't be used 671 - * in such cases since it accepts only _signed_ 32bit integer as IMM 672 - * argument. Also note that BPF_LD_IMM64 contains 2 instructions what matters 673 - * to count jumps properly. 674 - */ 675 - 676 - static int bind4_prog_load(const struct sock_addr_test *test) 677 - { 678 - union { 679 - uint8_t u4_addr8[4]; 680 - uint16_t u4_addr16[2]; 681 - uint32_t u4_addr32; 682 - } ip4, port; 683 - struct sockaddr_in addr4_rw; 684 - 685 - if (inet_pton(AF_INET, SERV4_IP, (void *)&ip4) != 1) { 686 - log_err("Invalid IPv4: %s", SERV4_IP); 687 - return -1; 688 - } 689 - 690 - port.u4_addr32 = htons(SERV4_PORT); 691 - 692 - if (mk_sockaddr(AF_INET, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, 693 - (struct sockaddr *)&addr4_rw, sizeof(addr4_rw)) == -1) 694 - return -1; 695 - 696 - /* See [1]. 
*/ 697 - struct bpf_insn insns[] = { 698 - BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), 699 - 700 - /* if (sk.family == AF_INET && */ 701 - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, 702 - offsetof(struct bpf_sock_addr, family)), 703 - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET, 32), 704 - 705 - /* (sk.type == SOCK_DGRAM || sk.type == SOCK_STREAM) && */ 706 - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, 707 - offsetof(struct bpf_sock_addr, type)), 708 - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_DGRAM, 1), 709 - BPF_JMP_A(1), 710 - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_STREAM, 28), 711 - 712 - /* 1st_byte_of_user_ip4 == expected && */ 713 - BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, 714 - offsetof(struct bpf_sock_addr, user_ip4)), 715 - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[0], 26), 716 - 717 - /* 2nd_byte_of_user_ip4 == expected && */ 718 - BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, 719 - offsetof(struct bpf_sock_addr, user_ip4) + 1), 720 - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[1], 24), 721 - 722 - /* 3rd_byte_of_user_ip4 == expected && */ 723 - BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, 724 - offsetof(struct bpf_sock_addr, user_ip4) + 2), 725 - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[2], 22), 726 - 727 - /* 4th_byte_of_user_ip4 == expected && */ 728 - BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, 729 - offsetof(struct bpf_sock_addr, user_ip4) + 3), 730 - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[3], 20), 731 - 732 - /* 1st_half_of_user_ip4 == expected && */ 733 - BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, 734 - offsetof(struct bpf_sock_addr, user_ip4)), 735 - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[0], 18), 736 - 737 - /* 2nd_half_of_user_ip4 == expected && */ 738 - BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, 739 - offsetof(struct bpf_sock_addr, user_ip4) + 2), 740 - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[1], 16), 741 - 742 - /* whole_user_ip4 == expected && */ 743 - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, 744 - offsetof(struct bpf_sock_addr, user_ip4)), 745 - 
BPF_LD_IMM64(BPF_REG_8, ip4.u4_addr32), /* See [2]. */ 746 - BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_8, 12), 747 - 748 - /* 1st_byte_of_user_port == expected && */ 749 - BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, 750 - offsetof(struct bpf_sock_addr, user_port)), 751 - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, port.u4_addr8[0], 10), 752 - 753 - /* 1st_half_of_user_port == expected && */ 754 - BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, 755 - offsetof(struct bpf_sock_addr, user_port)), 756 - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, port.u4_addr16[0], 8), 757 - 758 - /* user_port == expected) { */ 759 - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, 760 - offsetof(struct bpf_sock_addr, user_port)), 761 - BPF_LD_IMM64(BPF_REG_8, port.u4_addr32), /* See [2]. */ 762 - BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_8, 4), 763 - 764 - /* user_ip4 = addr4_rw.sin_addr */ 765 - BPF_MOV32_IMM(BPF_REG_7, addr4_rw.sin_addr.s_addr), 766 - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, 767 - offsetof(struct bpf_sock_addr, user_ip4)), 768 - 769 - /* user_port = addr4_rw.sin_port */ 770 - BPF_MOV32_IMM(BPF_REG_7, addr4_rw.sin_port), 771 - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, 772 - offsetof(struct bpf_sock_addr, user_port)), 773 - /* } */ 774 - 775 - /* return 1 */ 776 - BPF_MOV64_IMM(BPF_REG_0, 1), 777 - BPF_EXIT_INSN(), 778 - }; 779 - 780 - return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn)); 781 - } 782 - 783 - static int bind6_prog_load(const struct sock_addr_test *test) 784 - { 785 - struct sockaddr_in6 addr6_rw; 786 - struct in6_addr ip6; 787 - 788 - if (inet_pton(AF_INET6, SERV6_IP, (void *)&ip6) != 1) { 789 - log_err("Invalid IPv6: %s", SERV6_IP); 790 - return -1; 791 - } 792 - 793 - if (mk_sockaddr(AF_INET6, SERV6_REWRITE_IP, SERV6_REWRITE_PORT, 794 - (struct sockaddr *)&addr6_rw, sizeof(addr6_rw)) == -1) 795 - return -1; 796 - 797 - /* See [1]. 
*/ 798 - struct bpf_insn insns[] = { 799 - BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), 800 - 801 - /* if (sk.family == AF_INET6 && */ 802 - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, 803 - offsetof(struct bpf_sock_addr, family)), 804 - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET6, 18), 805 - 806 - /* 5th_byte_of_user_ip6 == expected && */ 807 - BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, 808 - offsetof(struct bpf_sock_addr, user_ip6[1])), 809 - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip6.s6_addr[4], 16), 810 - 811 - /* 3rd_half_of_user_ip6 == expected && */ 812 - BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, 813 - offsetof(struct bpf_sock_addr, user_ip6[1])), 814 - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip6.s6_addr16[2], 14), 815 - 816 - /* last_word_of_user_ip6 == expected) { */ 817 - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, 818 - offsetof(struct bpf_sock_addr, user_ip6[3])), 819 - BPF_LD_IMM64(BPF_REG_8, ip6.s6_addr32[3]), /* See [2]. */ 820 - BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_8, 10), 821 - 822 - 823 - #define STORE_IPV6_WORD(N) \ 824 - BPF_MOV32_IMM(BPF_REG_7, addr6_rw.sin6_addr.s6_addr32[N]), \ 825 - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, \ 826 - offsetof(struct bpf_sock_addr, user_ip6[N])) 827 - 828 - /* user_ip6 = addr6_rw.sin6_addr */ 829 - STORE_IPV6_WORD(0), 830 - STORE_IPV6_WORD(1), 831 - STORE_IPV6_WORD(2), 832 - STORE_IPV6_WORD(3), 833 - 834 - /* user_port = addr6_rw.sin6_port */ 835 - BPF_MOV32_IMM(BPF_REG_7, addr6_rw.sin6_port), 836 - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, 837 - offsetof(struct bpf_sock_addr, user_port)), 838 - 839 - /* } */ 840 - 841 - /* return 1 */ 842 - BPF_MOV64_IMM(BPF_REG_0, 1), 843 - BPF_EXIT_INSN(), 844 - }; 845 - 846 - return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn)); 847 - } 848 - 849 663 static int load_path(const struct sock_addr_test *test, const char *path) 850 664 { 851 665 struct bpf_prog_load_attr attr; ··· 681 863 } 682 864 683 865 return prog_fd; 866 + } 867 + 868 + static int bind4_prog_load(const struct 
sock_addr_test *test) 869 + { 870 + return load_path(test, BIND4_PROG_PATH); 871 + } 872 + 873 + static int bind6_prog_load(const struct sock_addr_test *test) 874 + { 875 + return load_path(test, BIND6_PROG_PATH); 684 876 } 685 877 686 878 static int connect4_prog_load(const struct sock_addr_test *test)

+23 -13

tools/testing/selftests/bpf/test_sockmap.c

··· 1273 1273 return "unknown"; 1274 1274 } 1275 1275 1276 + static void append_str(char *dst, const char *src, size_t dst_cap) 1277 + { 1278 + size_t avail = dst_cap - strlen(dst); 1279 + 1280 + if (avail <= 1) /* just zero byte could be written */ 1281 + return; 1282 + 1283 + strncat(dst, src, avail - 1); /* strncat() adds + 1 for zero byte */ 1284 + } 1285 + 1276 1286 #define OPTSTRING 60 1277 1287 static void test_options(char *options) 1278 1288 { ··· 1291 1281 memset(options, 0, OPTSTRING); 1292 1282 1293 1283 if (txmsg_pass) 1294 - strncat(options, "pass,", OPTSTRING); 1284 + append_str(options, "pass,", OPTSTRING); 1295 1285 if (txmsg_redir) 1296 - strncat(options, "redir,", OPTSTRING); 1286 + append_str(options, "redir,", OPTSTRING); 1297 1287 if (txmsg_drop) 1298 - strncat(options, "drop,", OPTSTRING); 1288 + append_str(options, "drop,", OPTSTRING); 1299 1289 if (txmsg_apply) { 1300 1290 snprintf(tstr, OPTSTRING, "apply %d,", txmsg_apply); 1301 - strncat(options, tstr, OPTSTRING); 1291 + append_str(options, tstr, OPTSTRING); 1302 1292 } 1303 1293 if (txmsg_cork) { 1304 1294 snprintf(tstr, OPTSTRING, "cork %d,", txmsg_cork); 1305 - strncat(options, tstr, OPTSTRING); 1295 + append_str(options, tstr, OPTSTRING); 1306 1296 } 1307 1297 if (txmsg_start) { 1308 1298 snprintf(tstr, OPTSTRING, "start %d,", txmsg_start); 1309 - strncat(options, tstr, OPTSTRING); 1299 + append_str(options, tstr, OPTSTRING); 1310 1300 } 1311 1301 if (txmsg_end) { 1312 1302 snprintf(tstr, OPTSTRING, "end %d,", txmsg_end); 1313 - strncat(options, tstr, OPTSTRING); 1303 + append_str(options, tstr, OPTSTRING); 1314 1304 } 1315 1305 if (txmsg_start_pop) { 1316 1306 snprintf(tstr, OPTSTRING, "pop (%d,%d),", 1317 1307 txmsg_start_pop, txmsg_start_pop + txmsg_pop); 1318 - strncat(options, tstr, OPTSTRING); 1308 + append_str(options, tstr, OPTSTRING); 1319 1309 } 1320 1310 if (txmsg_ingress) 1321 - strncat(options, "ingress,", OPTSTRING); 1311 + append_str(options, "ingress,", OPTSTRING); 
1322 1312 if (txmsg_redir_skb) 1323 - strncat(options, "redir_skb,", OPTSTRING); 1313 + append_str(options, "redir_skb,", OPTSTRING); 1324 1314 if (txmsg_ktls_skb) 1325 - strncat(options, "ktls_skb,", OPTSTRING); 1315 + append_str(options, "ktls_skb,", OPTSTRING); 1326 1316 if (ktls) 1327 - strncat(options, "ktls,", OPTSTRING); 1317 + append_str(options, "ktls,", OPTSTRING); 1328 1318 if (peek_flag) 1329 - strncat(options, "peek,", OPTSTRING); 1319 + append_str(options, "peek,", OPTSTRING); 1330 1320 } 1331 1321 1332 1322 static int __test_exec(int cgrp, int test, struct sockmap_options *opt)

+2

tools/testing/selftests/bpf/test_tcpbpf.h

··· 16 16 __u32 num_close_events; 17 17 __u32 tcp_save_syn; 18 18 __u32 tcp_saved_syn; 19 + __u32 window_clamp_client; 20 + __u32 window_clamp_server; 19 21 }; 20 22 #endif

+13

tools/testing/selftests/bpf/test_verifier.c

··· 1152 1152 1153 1153 static bool test_as_unpriv(struct bpf_test *test) 1154 1154 { 1155 + #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS 1156 + /* Some architectures have strict alignment requirements. In 1157 + * that case, the BPF verifier detects if a program has 1158 + * unaligned accesses and rejects them. A user can pass 1159 + * BPF_F_ANY_ALIGNMENT to a program to override this 1160 + * check. That, however, will only work when a privileged user 1161 + * loads a program. An unprivileged user loading a program 1162 + * with this flag will be rejected prior entering the 1163 + * verifier. 1164 + */ 1165 + if (test->flags & F_NEEDS_EFFICIENT_UNALIGNED_ACCESS) 1166 + return false; 1167 + #endif 1155 1168 return !test->prog_type || 1156 1169 test->prog_type == BPF_PROG_TYPE_SOCKET_FILTER || 1157 1170 test->prog_type == BPF_PROG_TYPE_CGROUP_SKB;

+7

tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c

··· 266 266 .result = REJECT, 267 267 .prog_type = BPF_PROG_TYPE_SK_LOOKUP, 268 268 .expected_attach_type = BPF_SK_LOOKUP, 269 + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 269 270 }, 270 271 { 271 272 "invalid 8-byte read from bpf_sk_lookup remote_ip4 field", ··· 293 292 .result = REJECT, 294 293 .prog_type = BPF_PROG_TYPE_SK_LOOKUP, 295 294 .expected_attach_type = BPF_SK_LOOKUP, 295 + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 296 296 }, 297 297 { 298 298 "invalid 8-byte read from bpf_sk_lookup remote_port field", ··· 307 305 .result = REJECT, 308 306 .prog_type = BPF_PROG_TYPE_SK_LOOKUP, 309 307 .expected_attach_type = BPF_SK_LOOKUP, 308 + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 310 309 }, 311 310 { 312 311 "invalid 8-byte read from bpf_sk_lookup local_ip4 field", ··· 334 331 .result = REJECT, 335 332 .prog_type = BPF_PROG_TYPE_SK_LOOKUP, 336 333 .expected_attach_type = BPF_SK_LOOKUP, 334 + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 337 335 }, 338 336 { 339 337 "invalid 8-byte read from bpf_sk_lookup local_port field", ··· 348 344 .result = REJECT, 349 345 .prog_type = BPF_PROG_TYPE_SK_LOOKUP, 350 346 .expected_attach_type = BPF_SK_LOOKUP, 347 + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 351 348 }, 352 349 /* invalid 1,2,4-byte reads from 8-byte fields in bpf_sk_lookup */ 353 350 { ··· 415 410 .result = REJECT, 416 411 .prog_type = BPF_PROG_TYPE_SK_LOOKUP, 417 412 .expected_attach_type = BPF_SK_LOOKUP, 413 + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 418 414 }, 419 415 { 420 416 "invalid 4-byte unaligned read from bpf_sk_lookup at even offset", ··· 428 422 .result = REJECT, 429 423 .prog_type = BPF_PROG_TYPE_SK_LOOKUP, 430 424 .expected_attach_type = BPF_SK_LOOKUP, 425 + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 431 426 }, 432 427 /* in-bound and out-of-bound writes to bpf_sk_lookup */ 433 428 {

+3

tools/testing/selftests/bpf/verifier/direct_value_access.c

··· 69 69 .fixup_map_array_48b = { 1 }, 70 70 .result = REJECT, 71 71 .errstr = "R1 min value is outside of the allowed memory range", 72 + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 72 73 }, 73 74 { 74 75 "direct map access, write test 7", ··· 196 195 .fixup_map_array_48b = { 1, 3 }, 197 196 .result = REJECT, 198 197 .errstr = "invalid access to map value, value_size=48 off=47 size=2", 198 + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 199 199 }, 200 200 { 201 201 "direct map access, write test 17", ··· 211 209 .fixup_map_array_48b = { 1, 3 }, 212 210 .result = REJECT, 213 211 .errstr = "invalid access to map value, value_size=48 off=47 size=2", 212 + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 214 213 }, 215 214 { 216 215 "direct map access, write test 18",

+1

tools/testing/selftests/bpf/verifier/map_ptr.c

··· 44 44 .errstr_unpriv = "bpf_array access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN", 45 45 .result = REJECT, 46 46 .errstr = "cannot access ptr member ops with moff 0 in struct bpf_map with off 1 size 4", 47 + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 47 48 }, 48 49 { 49 50 "bpf_map_ptr: read ops field accepted",

+1

tools/testing/selftests/bpf/verifier/raw_tp_writable.c

··· 31 31 .fixup_map_hash_8b = { 1, }, 32 32 .prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, 33 33 .errstr = "R6 invalid variable buffer offset: off=0, var_off=(0x0; 0xffffffff)", 34 + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 34 35 },

+4

tools/testing/selftests/bpf/verifier/ref_tracking.c

··· 675 675 .prog_type = BPF_PROG_TYPE_SCHED_CLS, 676 676 .result = REJECT, 677 677 .errstr = "invalid mem access", 678 + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 678 679 }, 679 680 { 680 681 "reference tracking: use ptr from bpf_sk_fullsock() after release", ··· 699 698 .prog_type = BPF_PROG_TYPE_SCHED_CLS, 700 699 .result = REJECT, 701 700 .errstr = "invalid mem access", 701 + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 702 702 }, 703 703 { 704 704 "reference tracking: use ptr from bpf_sk_fullsock(tp) after release", ··· 727 725 .prog_type = BPF_PROG_TYPE_SCHED_CLS, 728 726 .result = REJECT, 729 727 .errstr = "invalid mem access", 728 + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 730 729 }, 731 730 { 732 731 "reference tracking: use sk after bpf_sk_release(tp)", ··· 750 747 .prog_type = BPF_PROG_TYPE_SCHED_CLS, 751 748 .result = REJECT, 752 749 .errstr = "invalid mem access", 750 + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 753 751 }, 754 752 { 755 753 "reference tracking: use ptr from bpf_get_listener_sock() after bpf_sk_release(sk)",

+8

tools/testing/selftests/bpf/verifier/regalloc.c

··· 21 21 .fixup_map_hash_48b = { 4 }, 22 22 .result = ACCEPT, 23 23 .prog_type = BPF_PROG_TYPE_TRACEPOINT, 24 + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 24 25 }, 25 26 { 26 27 "regalloc negative", ··· 72 71 .fixup_map_hash_48b = { 4 }, 73 72 .result = ACCEPT, 74 73 .prog_type = BPF_PROG_TYPE_TRACEPOINT, 74 + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 75 75 }, 76 76 { 77 77 "regalloc src_reg negative", ··· 99 97 .result = REJECT, 100 98 .errstr = "invalid access to map value, value_size=48 off=44 size=8", 101 99 .prog_type = BPF_PROG_TYPE_TRACEPOINT, 100 + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 102 101 }, 103 102 { 104 103 "regalloc and spill", ··· 129 126 .fixup_map_hash_48b = { 4 }, 130 127 .result = ACCEPT, 131 128 .prog_type = BPF_PROG_TYPE_TRACEPOINT, 129 + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 132 130 }, 133 131 { 134 132 "regalloc and spill negative", ··· 160 156 .result = REJECT, 161 157 .errstr = "invalid access to map value, value_size=48 off=48 size=8", 162 158 .prog_type = BPF_PROG_TYPE_TRACEPOINT, 159 + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 163 160 }, 164 161 { 165 162 "regalloc three regs", ··· 187 182 .fixup_map_hash_48b = { 4 }, 188 183 .result = ACCEPT, 189 184 .prog_type = BPF_PROG_TYPE_TRACEPOINT, 185 + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 190 186 }, 191 187 { 192 188 "regalloc after call", ··· 216 210 .fixup_map_hash_48b = { 4 }, 217 211 .result = ACCEPT, 218 212 .prog_type = BPF_PROG_TYPE_TRACEPOINT, 213 + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 219 214 }, 220 215 { 221 216 "regalloc in callee", ··· 247 240 .fixup_map_hash_48b = { 4 }, 248 241 .result = ACCEPT, 249 242 .prog_type = BPF_PROG_TYPE_TRACEPOINT, 243 + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 250 244 }, 251 245 { 252 246 "regalloc, spill, JEQ",

+28 -18

tools/testing/selftests/bpf/verifier/wide_access.c

··· 1 - #define BPF_SOCK_ADDR_STORE(field, off, res, err) \ 1 + #define BPF_SOCK_ADDR_STORE(field, off, res, err, flgs) \ 2 2 { \ 3 3 "wide store to bpf_sock_addr." #field "[" #off "]", \ 4 4 .insns = { \ ··· 11 11 .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, \ 12 12 .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG, \ 13 13 .errstr = err, \ 14 + .flags = flgs, \ 14 15 } 15 16 16 17 /* user_ip6[0] is u64 aligned */ 17 18 BPF_SOCK_ADDR_STORE(user_ip6, 0, ACCEPT, 18 - NULL), 19 + NULL, 0), 19 20 BPF_SOCK_ADDR_STORE(user_ip6, 1, REJECT, 20 - "invalid bpf_context access off=12 size=8"), 21 + "invalid bpf_context access off=12 size=8", 22 + F_NEEDS_EFFICIENT_UNALIGNED_ACCESS), 21 23 BPF_SOCK_ADDR_STORE(user_ip6, 2, ACCEPT, 22 - NULL), 24 + NULL, 0), 23 25 BPF_SOCK_ADDR_STORE(user_ip6, 3, REJECT, 24 - "invalid bpf_context access off=20 size=8"), 26 + "invalid bpf_context access off=20 size=8", 27 + F_NEEDS_EFFICIENT_UNALIGNED_ACCESS), 25 28 26 29 /* msg_src_ip6[0] is _not_ u64 aligned */ 27 30 BPF_SOCK_ADDR_STORE(msg_src_ip6, 0, REJECT, 28 - "invalid bpf_context access off=44 size=8"), 31 + "invalid bpf_context access off=44 size=8", 32 + F_NEEDS_EFFICIENT_UNALIGNED_ACCESS), 29 33 BPF_SOCK_ADDR_STORE(msg_src_ip6, 1, ACCEPT, 30 - NULL), 34 + NULL, 0), 31 35 BPF_SOCK_ADDR_STORE(msg_src_ip6, 2, REJECT, 32 - "invalid bpf_context access off=52 size=8"), 36 + "invalid bpf_context access off=52 size=8", 37 + F_NEEDS_EFFICIENT_UNALIGNED_ACCESS), 33 38 BPF_SOCK_ADDR_STORE(msg_src_ip6, 3, REJECT, 34 - "invalid bpf_context access off=56 size=8"), 39 + "invalid bpf_context access off=56 size=8", 0), 35 40 36 41 #undef BPF_SOCK_ADDR_STORE 37 42 38 - #define BPF_SOCK_ADDR_LOAD(field, off, res, err) \ 43 + #define BPF_SOCK_ADDR_LOAD(field, off, res, err, flgs) \ 39 44 { \ 40 45 "wide load from bpf_sock_addr." 
#field "[" #off "]", \ 41 46 .insns = { \ ··· 53 48 .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, \ 54 49 .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG, \ 55 50 .errstr = err, \ 51 + .flags = flgs, \ 56 52 } 57 53 58 54 /* user_ip6[0] is u64 aligned */ 59 55 BPF_SOCK_ADDR_LOAD(user_ip6, 0, ACCEPT, 60 - NULL), 56 + NULL, 0), 61 57 BPF_SOCK_ADDR_LOAD(user_ip6, 1, REJECT, 62 - "invalid bpf_context access off=12 size=8"), 58 + "invalid bpf_context access off=12 size=8", 59 + F_NEEDS_EFFICIENT_UNALIGNED_ACCESS), 63 60 BPF_SOCK_ADDR_LOAD(user_ip6, 2, ACCEPT, 64 - NULL), 61 + NULL, 0), 65 62 BPF_SOCK_ADDR_LOAD(user_ip6, 3, REJECT, 66 - "invalid bpf_context access off=20 size=8"), 63 + "invalid bpf_context access off=20 size=8", 64 + F_NEEDS_EFFICIENT_UNALIGNED_ACCESS), 67 65 68 66 /* msg_src_ip6[0] is _not_ u64 aligned */ 69 67 BPF_SOCK_ADDR_LOAD(msg_src_ip6, 0, REJECT, 70 - "invalid bpf_context access off=44 size=8"), 68 + "invalid bpf_context access off=44 size=8", 69 + F_NEEDS_EFFICIENT_UNALIGNED_ACCESS), 71 70 BPF_SOCK_ADDR_LOAD(msg_src_ip6, 1, ACCEPT, 72 - NULL), 71 + NULL, 0), 73 72 BPF_SOCK_ADDR_LOAD(msg_src_ip6, 2, REJECT, 74 - "invalid bpf_context access off=52 size=8"), 73 + "invalid bpf_context access off=52 size=8", 74 + F_NEEDS_EFFICIENT_UNALIGNED_ACCESS), 75 75 BPF_SOCK_ADDR_LOAD(msg_src_ip6, 3, REJECT, 76 - "invalid bpf_context access off=56 size=8"), 76 + "invalid bpf_context access off=56 size=8", 0), 77 77 78 78 #undef BPF_SOCK_ADDR_LOAD