
Merge branch 'Address-XDP-frags-having-negative-tailroom'

Larysa Zaremba says:

====================
Address XDP frags having negative tailroom

Aside from the issue described below, the tailroom calculation does not
account for pages being split between frags, e.g. in i40e, enetc and
AF_XDP ZC with smaller chunks. This series addresses the problem by
taking the modulo (skb_frag_off() % rxq->frag_size) to get the data
offset within a smaller block of memory. Please note that the xskxceiver
tail-grow test passes without the modulo, e.g. in xdpdrv mode on i40e,
because there are not enough descriptors to reach flipped buffers.
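
As a standalone illustration of the modulo (a sketch with made-up sizes,
not code from the series): assume two 2K buffers carved out of one 4K
page, so skb_frag_off() of a frag in the second buffer already includes
the first buffer's 2K:

	u32 frag_size = 2048;		/* rxq->frag_size, i.e. truesize */
	u32 frag_len  = 1500;		/* skb_frag_size(frag) */
	u32 frag_off  = 2048 + 256;	/* page offset, crosses buffer 0 */
	int tailroom;

	/* Subtracting the whole page offset underflows
	 * (2048 - 1500 - 2304), while the modulo recovers the offset
	 * within this buffer: 2048 - 1500 - 256 = 292 bytes of tailroom.
	 */
	tailroom = frag_size - frag_len - frag_off % frag_size;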

Many Ethernet drivers report the XDP Rx queue frag size as being the same
as the DMA write size. However, the only user of this field, namely
bpf_xdp_frags_increase_tail(), clearly expects a truesize.

Such a difference leads to hard-to-attribute memory corruption under
certain circumstances, e.g. in ixgbevf the maximum DMA write size is 3 KB,
so when running xskxceiver's XDP_ADJUST_TAIL_GROW_MULTI_BUFF, a 6K packet
fully uses all DMA-writable space in 2 buffers. This would be fine if only
rxq->frag_size were properly set to 4K, but the value of 3K results in a
negative tailroom, because there is a non-zero page offset.

We are supposed to return -EINVAL and be done with it in such a case, but
because tailroom is stored as an unsigned int, it is reported to be
somewhere near UINT_MAX, so the tail is grown even though the requested
offset is too large (around 2K in the abovementioned test). This later
leads to all kinds of seemingly unrelated call traces:

[ 7340.337579] xskxceiver[1440]: segfault at 1da718 ip 00007f4161aeac9d sp 00007f41615a6a00 error 6
[ 7340.338040] xskxceiver[1441]: segfault at 7f410000000b ip 00000000004042b5 sp 00007f415bffecf0 error 4
[ 7340.338179] in libc.so.6[61c9d,7f4161aaf000+160000]
[ 7340.339230] in xskxceiver[42b5,400000+69000]
[ 7340.340300] likely on CPU 6 (core 0, socket 6)
[ 7340.340302] Code: ff ff 01 e9 f4 fe ff ff 0f 1f 44 00 00 4c 39 f0 74 73 31 c0 ba 01 00 00 00 f0 0f b1 17 0f 85 ba 00 00 00 49 8b 87 88 00 00 00 <4c> 89 70 08 eb cc 0f 1f 44 00 00 48 8d bd f0 fe ff ff 89 85 ec fe
[ 7340.340888] likely on CPU 3 (core 0, socket 3)
[ 7340.345088] Code: 00 00 00 ba 00 00 00 00 be 00 00 00 00 89 c7 e8 31 ca ff ff 89 45 ec 8b 45 ec 85 c0 78 07 b8 00 00 00 00 eb 46 e8 0b c8 ff ff <8b> 00 83 f8 69 74 24 e8 ff c7 ff ff 8b 00 83 f8 0b 74 18 e8 f3 c7
[ 7340.404334] Oops: general protection fault, probably for non-canonical address 0x6d255010bdffc: 0000 [#1] SMP NOPTI
[ 7340.405972] CPU: 7 UID: 0 PID: 1439 Comm: xskxceiver Not tainted 6.19.0-rc1+ #21 PREEMPT(lazy)
[ 7340.408006] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.17.0-5.fc42 04/01/2014
[ 7340.409716] RIP: 0010:lookup_swap_cgroup_id+0x44/0x80
[ 7340.410455] Code: 83 f8 1c 73 39 48 ba ff ff ff ff ff ff ff 03 48 8b 04 c5 20 55 fa bd 48 21 d1 48 89 ca 83 e1 01 48 d1 ea c1 e1 04 48 8d 04 90 <8b> 00 48 83 c4 10 d3 e8 c3 cc cc cc cc 31 c0 e9 98 b7 dd 00 48 89
[ 7340.412787] RSP: 0018:ffffcc5c04f7f6d0 EFLAGS: 00010202
[ 7340.413494] RAX: 0006d255010bdffc RBX: ffff891f477895a8 RCX: 0000000000000010
[ 7340.414431] RDX: 0001c17e3fffffff RSI: 00fa070000000000 RDI: 000382fc7fffffff
[ 7340.415354] RBP: 00fa070000000000 R08: ffffcc5c04f7f8f8 R09: ffffcc5c04f7f7d0
[ 7340.416283] R10: ffff891f4c1a7000 R11: ffffcc5c04f7f9c8 R12: ffffcc5c04f7f7d0
[ 7340.417218] R13: 03ffffffffffffff R14: 00fa06fffffffe00 R15: ffff891f47789500
[ 7340.418229] FS: 0000000000000000(0000) GS:ffff891ffdfaa000(0000) knlGS:0000000000000000
[ 7340.419489] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 7340.420286] CR2: 00007f415bfffd58 CR3: 0000000103f03002 CR4: 0000000000772ef0
[ 7340.421237] PKRU: 55555554
[ 7340.421623] Call Trace:
[ 7340.421987] <TASK>
[ 7340.422309] ? softleaf_from_pte+0x77/0xa0
[ 7340.422855] swap_pte_batch+0xa7/0x290
[ 7340.423363] zap_nonpresent_ptes.constprop.0.isra.0+0xd1/0x270
[ 7340.424102] zap_pte_range+0x281/0x580
[ 7340.424607] zap_pmd_range.isra.0+0xc9/0x240
[ 7340.425177] unmap_page_range+0x24d/0x420
[ 7340.425714] unmap_vmas+0xa1/0x180
[ 7340.426185] exit_mmap+0xe1/0x3b0
[ 7340.426644] __mmput+0x41/0x150
[ 7340.427098] exit_mm+0xb1/0x110
[ 7340.427539] do_exit+0x1b2/0x460
[ 7340.427992] do_group_exit+0x2d/0xc0
[ 7340.428477] get_signal+0x79d/0x7e0
[ 7340.428957] arch_do_signal_or_restart+0x34/0x100
[ 7340.429571] exit_to_user_mode_loop+0x8e/0x4c0
[ 7340.430159] do_syscall_64+0x188/0x6b0
[ 7340.430672] ? __do_sys_clone3+0xd9/0x120
[ 7340.431212] ? switch_fpu_return+0x4e/0xd0
[ 7340.431761] ? arch_exit_to_user_mode_prepare.isra.0+0xa1/0xc0
[ 7340.432498] ? do_syscall_64+0xbb/0x6b0
[ 7340.433015] ? __handle_mm_fault+0x445/0x690
[ 7340.433582] ? count_memcg_events+0xd6/0x210
[ 7340.434151] ? handle_mm_fault+0x212/0x340
[ 7340.434697] ? do_user_addr_fault+0x2b4/0x7b0
[ 7340.435271] ? clear_bhb_loop+0x30/0x80
[ 7340.435788] ? clear_bhb_loop+0x30/0x80
[ 7340.436299] ? clear_bhb_loop+0x30/0x80
[ 7340.436812] ? clear_bhb_loop+0x30/0x80
[ 7340.437323] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 7340.437973] RIP: 0033:0x7f4161b14169
[ 7340.438468] Code: Unable to access opcode bytes at 0x7f4161b1413f.
[ 7340.439242] RSP: 002b:00007ffc6ebfa770 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca
[ 7340.440173] RAX: fffffffffffffe00 RBX: 00000000000005a1 RCX: 00007f4161b14169
[ 7340.441061] RDX: 00000000000005a1 RSI: 0000000000000109 RDI: 00007f415bfff990
[ 7340.441943] RBP: 00007ffc6ebfa7a0 R08: 0000000000000000 R09: 00000000ffffffff
[ 7340.442824] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
[ 7340.443707] R13: 0000000000000000 R14: 00007f415bfff990 R15: 00007f415bfff6c0
[ 7340.444586] </TASK>
[ 7340.444922] Modules linked in: rfkill intel_rapl_msr intel_rapl_common intel_uncore_frequency_common skx_edac_common nfit libnvdimm kvm_intel vfat fat kvm snd_pcm irqbypass rapl iTCO_wdt snd_timer intel_pmc_bxt iTCO_vendor_support snd ixgbevf virtio_net soundcore i2c_i801 pcspkr libeth_xdp net_failover i2c_smbus lpc_ich failover libeth virtio_balloon joydev 9p fuse loop zram lz4hc_compress lz4_compress 9pnet_virtio 9pnet netfs ghash_clmulni_intel serio_raw qemu_fw_cfg
[ 7340.449650] ---[ end trace 0000000000000000 ]---

The issue can be fixed in all in-tree drivers, but we cannot trust
out-of-tree drivers not to make the same mistake. Therefore, make tailroom
a signed int and produce a warning when it is negative (see the
net/core/filter.c change below) to prevent such mistakes in the future.

The issue can also be easily reproduced with the ice driver by applying
the following diff to xskxceiver and enjoying a kernel panic in xdpdrv
mode:

diff --git a/tools/testing/selftests/bpf/prog_tests/test_xsk.c b/tools/testing/selftests/bpf/prog_tests/test_xsk.c
index 5af28f359cfd..042d587fa7ef 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_xsk.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_xsk.c
@@ -2541,8 +2541,8 @@ int testapp_adjust_tail_grow_mb(struct test_spec *test)
{
test->mtu = MAX_ETH_JUMBO_SIZE;
/* Grow by (frag_size - last_frag_Size) - 1 to stay inside the last fragment */
- return testapp_adjust_tail(test, (XSK_UMEM__MAX_FRAME_SIZE / 2) - 1,
- XSK_UMEM__LARGE_FRAME_SIZE * 2);
+ return testapp_adjust_tail(test, XSK_UMEM__MAX_FRAME_SIZE * 100,
+ 6912);
}

int testapp_tx_queue_consumer(struct test_spec *test)

If we print out the values involved in the tailroom calculation:

tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag);

4294967040 = 3456 - 3456 - 256
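
For reference, a minimal userspace sketch of the same arithmetic (values
taken from the printout above, not kernel code):

	#include <stdio.h>

	int main(void)
	{
		unsigned int frag_size = 3456, frag_len = 3456, frag_off = 256;
		/* unsigned: -256 wraps to 4294967040, so an
		 * "offset > tailroom" bounds check never fires
		 */
		unsigned int u_tailroom = frag_size - frag_len - frag_off;
		/* signed, as after this series: stays negative, so the
		 * grow request is correctly rejected with -EINVAL
		 */
		int s_tailroom = (int)frag_size - (int)frag_len -
				 (int)(frag_off % frag_size);

		/* prints "unsigned: 4294967040, signed: -256" */
		printf("unsigned: %u, signed: %d\n", u_tailroom, s_tailroom);
		return 0;
	}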

I have personally reproduced and verified the issue in ice and i40e,
aside from the WiP ixgbevf implementation.
====================

Link: https://patch.msgid.link/20260305111253.2317394-1-larysa.zaremba@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

13 files changed, 69 insertions(+), 47 deletions(-)

diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c
@@ -3467,7 +3467,7 @@
 	priv->rx_ring[i] = bdr;
 
 	err = __xdp_rxq_info_reg(&bdr->xdp.rxq, priv->ndev, i, 0,
-				 ENETC_RXB_DMA_SIZE_XDP);
+				 ENETC_RXB_TRUESIZE);
 	if (err)
 		goto free_vector;
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -3569,6 +3569,7 @@
 	u16 pf_q = vsi->base_queue + ring->queue_index;
 	struct i40e_hw *hw = &vsi->back->hw;
 	struct i40e_hmc_obj_rxq rx_ctx;
+	u32 xdp_frame_sz;
 	int err = 0;
 	bool ok;
 
@@ -3579,49 +3580,47 @@
 	memset(&rx_ctx, 0, sizeof(rx_ctx));
 
 	ring->rx_buf_len = vsi->rx_buf_len;
+	xdp_frame_sz = i40e_rx_pg_size(ring) / 2;
 
 	/* XDP RX-queue info only needed for RX rings exposed to XDP */
 	if (ring->vsi->type != I40E_VSI_MAIN)
 		goto skip;
 
-	if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) {
-		err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
-					 ring->queue_index,
-					 ring->q_vector->napi.napi_id,
-					 ring->rx_buf_len);
-		if (err)
-			return err;
-	}
-
 	ring->xsk_pool = i40e_xsk_pool(ring);
 	if (ring->xsk_pool) {
-		xdp_rxq_info_unreg(&ring->xdp_rxq);
+		xdp_frame_sz = xsk_pool_get_rx_frag_step(ring->xsk_pool);
 		ring->rx_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool);
 		err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
 					 ring->queue_index,
 					 ring->q_vector->napi.napi_id,
-					 ring->rx_buf_len);
+					 xdp_frame_sz);
 		if (err)
 			return err;
 		err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
 						 MEM_TYPE_XSK_BUFF_POOL,
 						 NULL);
 		if (err)
-			return err;
+			goto unreg_xdp;
 		dev_info(&vsi->back->pdev->dev,
 			 "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n",
 			 ring->queue_index);
 
 	} else {
+		err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
+					 ring->queue_index,
+					 ring->q_vector->napi.napi_id,
+					 xdp_frame_sz);
+		if (err)
+			return err;
 		err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
 						 MEM_TYPE_PAGE_SHARED,
 						 NULL);
 		if (err)
-			return err;
+			goto unreg_xdp;
 	}
 
 skip:
-	xdp_init_buff(&ring->xdp, i40e_rx_pg_size(ring) / 2, &ring->xdp_rxq);
+	xdp_init_buff(&ring->xdp, xdp_frame_sz, &ring->xdp_rxq);
 
 	rx_ctx.dbuff = DIV_ROUND_UP(ring->rx_buf_len,
 				    BIT_ULL(I40E_RXQ_CTX_DBUFF_SHIFT));
@@ -3653,5 +3652,6 @@
 		dev_info(&vsi->back->pdev->dev,
 			 "Failed to clear LAN Rx queue context on Rx ring %d (pf_q %d), error: %d\n",
 			 ring->queue_index, pf_q, err);
-		return -ENOMEM;
+		err = -ENOMEM;
+		goto unreg_xdp;
 	}
@@ -3663,5 +3663,6 @@
 		dev_info(&vsi->back->pdev->dev,
 			 "Failed to set LAN Rx queue context on Rx ring %d (pf_q %d), error: %d\n",
 			 ring->queue_index, pf_q, err);
-		return -ENOMEM;
+		err = -ENOMEM;
+		goto unreg_xdp;
 	}
@@ -3672,7 +3673,8 @@
 		if (I40E_2K_TOO_SMALL_WITH_PADDING) {
 			dev_info(&vsi->back->pdev->dev,
 				 "2k Rx buffer is too small to fit standard MTU and skb_shared_info\n");
-			return -EOPNOTSUPP;
+			err = -EOPNOTSUPP;
+			goto unreg_xdp;
 		}
 		clear_ring_build_skb_enabled(ring);
 	} else {
@@ -3703,6 +3705,11 @@
 	}
 
 	return 0;
+unreg_xdp:
+	if (ring->vsi->type == I40E_VSI_MAIN)
+		xdp_rxq_info_unreg(&ring->xdp_rxq);
+
+	return err;
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -1470,6 +1470,9 @@
 	if (!rx_ring->rx_bi)
 		return;
 
+	if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq))
+		xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
+
 	if (rx_ring->xsk_pool) {
 		i40e_xsk_clean_rx_ring(rx_ring);
 		goto skip_free;
@@ -1530,8 +1533,6 @@
 void i40e_free_rx_resources(struct i40e_ring *rx_ring)
 {
 	i40e_clean_rx_ring(rx_ring);
-	if (rx_ring->vsi->type == I40E_VSI_MAIN)
-		xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
 	rx_ring->xdp_prog = NULL;
 	kfree(rx_ring->rx_bi);
 	rx_ring->rx_bi = NULL;
diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c
@@ -661,34 +661,22 @@
 {
 	struct device *dev = ice_pf_to_dev(ring->vsi->back);
 	u32 num_bufs = ICE_DESC_UNUSED(ring);
-	u32 rx_buf_len;
 	int err;
 
 	if (ring->vsi->type == ICE_VSI_PF || ring->vsi->type == ICE_VSI_SF ||
 	    ring->vsi->type == ICE_VSI_LB) {
-		if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) {
-			err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
-						 ring->q_index,
-						 ring->q_vector->napi.napi_id,
-						 ring->rx_buf_len);
-			if (err)
-				return err;
-		}
-
 		ice_rx_xsk_pool(ring);
 		err = ice_realloc_rx_xdp_bufs(ring, ring->xsk_pool);
 		if (err)
 			return err;
 
 		if (ring->xsk_pool) {
-			xdp_rxq_info_unreg(&ring->xdp_rxq);
-
-			rx_buf_len =
-				xsk_pool_get_rx_frame_size(ring->xsk_pool);
+			u32 frag_size =
+				xsk_pool_get_rx_frag_step(ring->xsk_pool);
 			err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
 						 ring->q_index,
 						 ring->q_vector->napi.napi_id,
-						 rx_buf_len);
+						 frag_size);
 			if (err)
 				return err;
 			err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
@@ -717,14 +705,13 @@
 			if (err)
 				return err;
 
-			if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) {
-				err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
-							 ring->q_index,
-							 ring->q_vector->napi.napi_id,
-							 ring->rx_buf_len);
-				if (err)
-					goto err_destroy_fq;
-			}
+			err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
+						 ring->q_index,
+						 ring->q_vector->napi.napi_id,
+						 ring->truesize);
+			if (err)
+				goto err_destroy_fq;
+
 			xdp_rxq_info_attach_page_pool(&ring->xdp_rxq,
 						      ring->pp);
 		}
diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c
@@ -3342,6 +3342,7 @@
 		rx_rings[i].cached_phctime = pf->ptp.cached_phc_time;
 		rx_rings[i].desc = NULL;
 		rx_rings[i].xdp_buf = NULL;
+		rx_rings[i].xdp_rxq = (struct xdp_rxq_info){ };
 
 		/* this is to allow wr32 to have something to write to
 		 * during early allocation of Rx buffers
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -560,7 +560,9 @@
 		i = 0;
 	}
 
-	if (rx_ring->vsi->type == ICE_VSI_PF &&
+	if ((rx_ring->vsi->type == ICE_VSI_PF ||
+	     rx_ring->vsi->type == ICE_VSI_SF ||
+	     rx_ring->vsi->type == ICE_VSI_LB) &&
 	    xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) {
 		xdp_rxq_info_detach_mem_model(&rx_ring->xdp_rxq);
 		xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
@@ -899,6 +899,9 @@
 	u16 ntc = rx_ring->next_to_clean;
 	u16 ntu = rx_ring->next_to_use;
 
+	if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq))
+		xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
+
 	while (ntc != ntu) {
 		struct xdp_buff *xdp = *ice_xdp_buf(rx_ring, ntc);
 
diff --git a/drivers/net/ethernet/intel/idpf/xdp.c b/drivers/net/ethernet/intel/idpf/xdp.c
@@ -47,12 +47,16 @@
 {
 	const struct idpf_vport *vport = rxq->q_vector->vport;
 	const struct idpf_q_vec_rsrc *rsrc;
+	u32 frag_size = 0;
 	bool split;
 	int err;
 
+	if (idpf_queue_has(XSK, rxq))
+		frag_size = rxq->bufq_sets[0].bufq.truesize;
+
 	err = __xdp_rxq_info_reg(&rxq->xdp_rxq, vport->netdev, rxq->idx,
 				 rxq->q_vector->napi.napi_id,
-				 rxq->rx_buf_size);
+				 frag_size);
 	if (err)
 		return err;
 
diff --git a/drivers/net/ethernet/intel/idpf/xsk.c b/drivers/net/ethernet/intel/idpf/xsk.c
@@ -403,6 +403,7 @@
 	bufq->pending = fq.pending;
 	bufq->thresh = fq.thresh;
 	bufq->rx_buf_size = fq.buf_len;
+	bufq->truesize = fq.truesize;
 
 	if (!idpf_xskfq_refill(bufq))
 		netdev_err(bufq->pool->netdev,
diff --git a/drivers/net/ethernet/intel/libeth/xsk.c b/drivers/net/ethernet/intel/libeth/xsk.c
@@ -167,6 +167,7 @@
 	fq->pending = fq->count;
 	fq->thresh = libeth_xdp_queue_threshold(fq->count);
 	fq->buf_len = xsk_pool_get_rx_frame_size(fq->pool);
+	fq->truesize = xsk_pool_get_rx_frag_step(fq->pool);
 
 	return 0;
 }
diff --git a/include/net/libeth/xsk.h b/include/net/libeth/xsk.h
@@ -597,6 +597,7 @@
  * @pending: current number of XSkFQEs to refill
  * @thresh: threshold below which the queue is refilled
  * @buf_len: HW-writeable length per each buffer
+ * @truesize: step between consecutive buffers, 0 if none exists
  * @nid: ID of the closest NUMA node with memory
  */
 struct libeth_xskfq {
@@ -615,6 +616,8 @@
 	u32 thresh;
 
 	u32 buf_len;
+	u32 truesize;
+
 	int nid;
 };
 
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
@@ -51,6 +51,11 @@
 	return xsk_pool_get_chunk_size(pool) - xsk_pool_get_headroom(pool);
 }
 
+static inline u32 xsk_pool_get_rx_frag_step(struct xsk_buff_pool *pool)
+{
+	return pool->unaligned ? 0 : xsk_pool_get_chunk_size(pool);
+}
+
 static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool,
 					 struct xdp_rxq_info *rxq)
 {
@@ -338,6 +343,11 @@
 }
 
 static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool)
+{
+	return 0;
+}
+
+static inline u32 xsk_pool_get_rx_frag_step(struct xsk_buff_pool *pool)
 {
 	return 0;
 }
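
For drivers, usage of the new helper boils down to the following sketch
(condensed from the i40e/ice hunks above; the ring fields are
illustrative):

	/* Register the XDP Rx queue with the buffer stride (truesize),
	 * not the DMA-writable length. For unaligned pools the helper
	 * returns 0, which makes bpf_xdp_frags_increase_tail() return
	 * -EOPNOTSUPP instead of assuming a stride that does not exist.
	 */
	u32 frag_sz = xsk_pool_get_rx_frag_step(ring->xsk_pool);

	err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
				 ring->queue_index,
				 ring->q_vector->napi.napi_id, frag_sz);
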
diff --git a/net/core/filter.c b/net/core/filter.c
@@ -4150,12 +4150,14 @@
 	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
 	skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1];
 	struct xdp_rxq_info *rxq = xdp->rxq;
-	unsigned int tailroom;
+	int tailroom;
 
 	if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz)
 		return -EOPNOTSUPP;
 
-	tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag);
+	tailroom = rxq->frag_size - skb_frag_size(frag) -
+		   skb_frag_off(frag) % rxq->frag_size;
+	WARN_ON_ONCE(tailroom < 0);
 	if (unlikely(offset > tailroom))
 		return -EINVAL;
 