Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'Address-XDP-frags-having-negative-tailroom'

Larysa Zaremba says:

====================
Address XDP frags having negative tailroom

Aside from the issue described below, tailroom calculation does not account
for pages being split between frags, e.g. in i40e, enetc and
AF_XDP ZC with smaller chunks. This series addresses the problem by
calculating the modulo (skb_frag_off() % rxq->frag_size) in order to get the
data offset within a smaller block of memory. Please note, xskxceiver
tail grow test passes without modulo e.g. in xdpdrv mode on i40e,
because there are not enough descriptors to get to flipped buffers.

Many ethernet drivers report xdp Rx queue frag size as being the same as
DMA write size. However, the only user of this field, namely
bpf_xdp_frags_increase_tail(), clearly expects a truesize.

Such difference leads to unspecific memory corruption issues under certain
circumstances, e.g. in ixgbevf maximum DMA write size is 3 KB, so when
running xskxceiver's XDP_ADJUST_TAIL_GROW_MULTI_BUFF, 6K packet fully uses
all DMA-writable space in 2 buffers. This would be fine, if only
rxq->frag_size was properly set to 4K, but a value of 3K results in a
negative tailroom, because there is a non-zero page offset.

We are supposed to return -EINVAL and be done with it in such a case,
but due to tailroom being stored as an unsigned int, it is reported to be
somewhere near UINT_MAX, resulting in a tail being grown, even if the
requested offset is too much (it is around 2K in the abovementioned test).
This later leads to all kinds of unspecific calltraces.

[ 7340.337579] xskxceiver[1440]: segfault at 1da718 ip 00007f4161aeac9d sp 00007f41615a6a00 error 6
[ 7340.338040] xskxceiver[1441]: segfault at 7f410000000b ip 00000000004042b5 sp 00007f415bffecf0 error 4
[ 7340.338179] in libc.so.6[61c9d,7f4161aaf000+160000]
[ 7340.339230] in xskxceiver[42b5,400000+69000]
[ 7340.340300] likely on CPU 6 (core 0, socket 6)
[ 7340.340302] Code: ff ff 01 e9 f4 fe ff ff 0f 1f 44 00 00 4c 39 f0 74 73 31 c0 ba 01 00 00 00 f0 0f b1 17 0f 85 ba 00 00 00 49 8b 87 88 00 00 00 <4c> 89 70 08 eb cc 0f 1f 44 00 00 48 8d bd f0 fe ff ff 89 85 ec fe
[ 7340.340888] likely on CPU 3 (core 0, socket 3)
[ 7340.345088] Code: 00 00 00 ba 00 00 00 00 be 00 00 00 00 89 c7 e8 31 ca ff ff 89 45 ec 8b 45 ec 85 c0 78 07 b8 00 00 00 00 eb 46 e8 0b c8 ff ff <8b> 00 83 f8 69 74 24 e8 ff c7 ff ff 8b 00 83 f8 0b 74 18 e8 f3 c7
[ 7340.404334] Oops: general protection fault, probably for non-canonical address 0x6d255010bdffc: 0000 [#1] SMP NOPTI
[ 7340.405972] CPU: 7 UID: 0 PID: 1439 Comm: xskxceiver Not tainted 6.19.0-rc1+ #21 PREEMPT(lazy)
[ 7340.408006] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.17.0-5.fc42 04/01/2014
[ 7340.409716] RIP: 0010:lookup_swap_cgroup_id+0x44/0x80
[ 7340.410455] Code: 83 f8 1c 73 39 48 ba ff ff ff ff ff ff ff 03 48 8b 04 c5 20 55 fa bd 48 21 d1 48 89 ca 83 e1 01 48 d1 ea c1 e1 04 48 8d 04 90 <8b> 00 48 83 c4 10 d3 e8 c3 cc cc cc cc 31 c0 e9 98 b7 dd 00 48 89
[ 7340.412787] RSP: 0018:ffffcc5c04f7f6d0 EFLAGS: 00010202
[ 7340.413494] RAX: 0006d255010bdffc RBX: ffff891f477895a8 RCX: 0000000000000010
[ 7340.414431] RDX: 0001c17e3fffffff RSI: 00fa070000000000 RDI: 000382fc7fffffff
[ 7340.415354] RBP: 00fa070000000000 R08: ffffcc5c04f7f8f8 R09: ffffcc5c04f7f7d0
[ 7340.416283] R10: ffff891f4c1a7000 R11: ffffcc5c04f7f9c8 R12: ffffcc5c04f7f7d0
[ 7340.417218] R13: 03ffffffffffffff R14: 00fa06fffffffe00 R15: ffff891f47789500
[ 7340.418229] FS: 0000000000000000(0000) GS:ffff891ffdfaa000(0000) knlGS:0000000000000000
[ 7340.419489] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 7340.420286] CR2: 00007f415bfffd58 CR3: 0000000103f03002 CR4: 0000000000772ef0
[ 7340.421237] PKRU: 55555554
[ 7340.421623] Call Trace:
[ 7340.421987] <TASK>
[ 7340.422309] ? softleaf_from_pte+0x77/0xa0
[ 7340.422855] swap_pte_batch+0xa7/0x290
[ 7340.423363] zap_nonpresent_ptes.constprop.0.isra.0+0xd1/0x270
[ 7340.424102] zap_pte_range+0x281/0x580
[ 7340.424607] zap_pmd_range.isra.0+0xc9/0x240
[ 7340.425177] unmap_page_range+0x24d/0x420
[ 7340.425714] unmap_vmas+0xa1/0x180
[ 7340.426185] exit_mmap+0xe1/0x3b0
[ 7340.426644] __mmput+0x41/0x150
[ 7340.427098] exit_mm+0xb1/0x110
[ 7340.427539] do_exit+0x1b2/0x460
[ 7340.427992] do_group_exit+0x2d/0xc0
[ 7340.428477] get_signal+0x79d/0x7e0
[ 7340.428957] arch_do_signal_or_restart+0x34/0x100
[ 7340.429571] exit_to_user_mode_loop+0x8e/0x4c0
[ 7340.430159] do_syscall_64+0x188/0x6b0
[ 7340.430672] ? __do_sys_clone3+0xd9/0x120
[ 7340.431212] ? switch_fpu_return+0x4e/0xd0
[ 7340.431761] ? arch_exit_to_user_mode_prepare.isra.0+0xa1/0xc0
[ 7340.432498] ? do_syscall_64+0xbb/0x6b0
[ 7340.433015] ? __handle_mm_fault+0x445/0x690
[ 7340.433582] ? count_memcg_events+0xd6/0x210
[ 7340.434151] ? handle_mm_fault+0x212/0x340
[ 7340.434697] ? do_user_addr_fault+0x2b4/0x7b0
[ 7340.435271] ? clear_bhb_loop+0x30/0x80
[ 7340.435788] ? clear_bhb_loop+0x30/0x80
[ 7340.436299] ? clear_bhb_loop+0x30/0x80
[ 7340.436812] ? clear_bhb_loop+0x30/0x80
[ 7340.437323] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 7340.437973] RIP: 0033:0x7f4161b14169
[ 7340.438468] Code: Unable to access opcode bytes at 0x7f4161b1413f.
[ 7340.439242] RSP: 002b:00007ffc6ebfa770 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca
[ 7340.440173] RAX: fffffffffffffe00 RBX: 00000000000005a1 RCX: 00007f4161b14169
[ 7340.441061] RDX: 00000000000005a1 RSI: 0000000000000109 RDI: 00007f415bfff990
[ 7340.441943] RBP: 00007ffc6ebfa7a0 R08: 0000000000000000 R09: 00000000ffffffff
[ 7340.442824] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
[ 7340.443707] R13: 0000000000000000 R14: 00007f415bfff990 R15: 00007f415bfff6c0
[ 7340.444586] </TASK>
[ 7340.444922] Modules linked in: rfkill intel_rapl_msr intel_rapl_common intel_uncore_frequency_common skx_edac_common nfit libnvdimm kvm_intel vfat fat kvm snd_pcm irqbypass rapl iTCO_wdt snd_timer intel_pmc_bxt iTCO_vendor_support snd ixgbevf virtio_net soundcore i2c_i801 pcspkr libeth_xdp net_failover i2c_smbus lpc_ich failover libeth virtio_balloon joydev 9p fuse loop zram lz4hc_compress lz4_compress 9pnet_virtio 9pnet netfs ghash_clmulni_intel serio_raw qemu_fw_cfg
[ 7340.449650] ---[ end trace 0000000000000000 ]---

The issue can be fixed in all in-tree drivers, but we cannot just trust OOT
drivers to not do this. Therefore, make tailroom a signed int and produce a
warning when it is negative to prevent such mistakes in the future.

The issue can also be easily reproduced with ice driver, by applying
the following diff to xskxceiver and enjoying a kernel panic in xdpdrv mode:

diff --git a/tools/testing/selftests/bpf/prog_tests/test_xsk.c b/tools/testing/selftests/bpf/prog_tests/test_xsk.c
index 5af28f359cfd..042d587fa7ef 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_xsk.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_xsk.c
@@ -2541,8 +2541,8 @@ int testapp_adjust_tail_grow_mb(struct test_spec *test)
{
test->mtu = MAX_ETH_JUMBO_SIZE;
/* Grow by (frag_size - last_frag_Size) - 1 to stay inside the last fragment */
- return testapp_adjust_tail(test, (XSK_UMEM__MAX_FRAME_SIZE / 2) - 1,
- XSK_UMEM__LARGE_FRAME_SIZE * 2);
+ return testapp_adjust_tail(test, XSK_UMEM__MAX_FRAME_SIZE * 100,
+ 6912);
}

int testapp_tx_queue_consumer(struct test_spec *test)

If we print out the values involved in the tailroom calculation:

tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag);

4294967040 = 3456 - 3456 - 256

I personally reproduced and verified the issue in ice and i40e,
aside from WiP ixgbevf implementation.
====================

Link: https://patch.msgid.link/20260305111253.2317394-1-larysa.zaremba@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+69 -47
+1 -1
drivers/net/ethernet/freescale/enetc/enetc.c
··· 3467 3467 priv->rx_ring[i] = bdr; 3468 3468 3469 3469 err = __xdp_rxq_info_reg(&bdr->xdp.rxq, priv->ndev, i, 0, 3470 - ENETC_RXB_DMA_SIZE_XDP); 3470 + ENETC_RXB_TRUESIZE); 3471 3471 if (err) 3472 3472 goto free_vector; 3473 3473
+24 -17
drivers/net/ethernet/intel/i40e/i40e_main.c
··· 3569 3569 u16 pf_q = vsi->base_queue + ring->queue_index; 3570 3570 struct i40e_hw *hw = &vsi->back->hw; 3571 3571 struct i40e_hmc_obj_rxq rx_ctx; 3572 + u32 xdp_frame_sz; 3572 3573 int err = 0; 3573 3574 bool ok; 3574 3575 ··· 3579 3578 memset(&rx_ctx, 0, sizeof(rx_ctx)); 3580 3579 3581 3580 ring->rx_buf_len = vsi->rx_buf_len; 3581 + xdp_frame_sz = i40e_rx_pg_size(ring) / 2; 3582 3582 3583 3583 /* XDP RX-queue info only needed for RX rings exposed to XDP */ 3584 3584 if (ring->vsi->type != I40E_VSI_MAIN) 3585 3585 goto skip; 3586 3586 3587 - if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) { 3588 - err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, 3589 - ring->queue_index, 3590 - ring->q_vector->napi.napi_id, 3591 - ring->rx_buf_len); 3592 - if (err) 3593 - return err; 3594 - } 3595 - 3596 3587 ring->xsk_pool = i40e_xsk_pool(ring); 3597 3588 if (ring->xsk_pool) { 3598 - xdp_rxq_info_unreg(&ring->xdp_rxq); 3589 + xdp_frame_sz = xsk_pool_get_rx_frag_step(ring->xsk_pool); 3599 3590 ring->rx_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool); 3600 3591 err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, 3601 3592 ring->queue_index, 3602 3593 ring->q_vector->napi.napi_id, 3603 - ring->rx_buf_len); 3594 + xdp_frame_sz); 3604 3595 if (err) 3605 3596 return err; 3606 3597 err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, 3607 3598 MEM_TYPE_XSK_BUFF_POOL, 3608 3599 NULL); 3609 3600 if (err) 3610 - return err; 3601 + goto unreg_xdp; 3611 3602 dev_info(&vsi->back->pdev->dev, 3612 3603 "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n", 3613 3604 ring->queue_index); 3614 3605 3615 3606 } else { 3607 + err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, 3608 + ring->queue_index, 3609 + ring->q_vector->napi.napi_id, 3610 + xdp_frame_sz); 3611 + if (err) 3612 + return err; 3616 3613 err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, 3617 3614 MEM_TYPE_PAGE_SHARED, 3618 3615 NULL); 3619 3616 if (err) 3620 - return err; 3617 + goto unreg_xdp; 3621 3618 } 3622 
3619 3623 3620 skip: 3624 - xdp_init_buff(&ring->xdp, i40e_rx_pg_size(ring) / 2, &ring->xdp_rxq); 3621 + xdp_init_buff(&ring->xdp, xdp_frame_sz, &ring->xdp_rxq); 3625 3622 3626 3623 rx_ctx.dbuff = DIV_ROUND_UP(ring->rx_buf_len, 3627 3624 BIT_ULL(I40E_RXQ_CTX_DBUFF_SHIFT)); ··· 3653 3654 dev_info(&vsi->back->pdev->dev, 3654 3655 "Failed to clear LAN Rx queue context on Rx ring %d (pf_q %d), error: %d\n", 3655 3656 ring->queue_index, pf_q, err); 3656 - return -ENOMEM; 3657 + err = -ENOMEM; 3658 + goto unreg_xdp; 3657 3659 } 3658 3660 3659 3661 /* set the context in the HMC */ ··· 3663 3663 dev_info(&vsi->back->pdev->dev, 3664 3664 "Failed to set LAN Rx queue context on Rx ring %d (pf_q %d), error: %d\n", 3665 3665 ring->queue_index, pf_q, err); 3666 - return -ENOMEM; 3666 + err = -ENOMEM; 3667 + goto unreg_xdp; 3667 3668 } 3668 3669 3669 3670 /* configure Rx buffer alignment */ ··· 3672 3671 if (I40E_2K_TOO_SMALL_WITH_PADDING) { 3673 3672 dev_info(&vsi->back->pdev->dev, 3674 3673 "2k Rx buffer is too small to fit standard MTU and skb_shared_info\n"); 3675 - return -EOPNOTSUPP; 3674 + err = -EOPNOTSUPP; 3675 + goto unreg_xdp; 3676 3676 } 3677 3677 clear_ring_build_skb_enabled(ring); 3678 3678 } else { ··· 3703 3701 } 3704 3702 3705 3703 return 0; 3704 + unreg_xdp: 3705 + if (ring->vsi->type == I40E_VSI_MAIN) 3706 + xdp_rxq_info_unreg(&ring->xdp_rxq); 3707 + 3708 + return err; 3706 3709 } 3707 3710 3708 3711 /**
+3 -2
drivers/net/ethernet/intel/i40e/i40e_txrx.c
··· 1470 1470 if (!rx_ring->rx_bi) 1471 1471 return; 1472 1472 1473 + if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) 1474 + xdp_rxq_info_unreg(&rx_ring->xdp_rxq); 1475 + 1473 1476 if (rx_ring->xsk_pool) { 1474 1477 i40e_xsk_clean_rx_ring(rx_ring); 1475 1478 goto skip_free; ··· 1530 1527 void i40e_free_rx_resources(struct i40e_ring *rx_ring) 1531 1528 { 1532 1529 i40e_clean_rx_ring(rx_ring); 1533 - if (rx_ring->vsi->type == I40E_VSI_MAIN) 1534 - xdp_rxq_info_unreg(&rx_ring->xdp_rxq); 1535 1530 rx_ring->xdp_prog = NULL; 1536 1531 kfree(rx_ring->rx_bi); 1537 1532 rx_ring->rx_bi = NULL;
+10 -23
drivers/net/ethernet/intel/ice/ice_base.c
··· 661 661 { 662 662 struct device *dev = ice_pf_to_dev(ring->vsi->back); 663 663 u32 num_bufs = ICE_DESC_UNUSED(ring); 664 - u32 rx_buf_len; 665 664 int err; 666 665 667 666 if (ring->vsi->type == ICE_VSI_PF || ring->vsi->type == ICE_VSI_SF || 668 667 ring->vsi->type == ICE_VSI_LB) { 669 - if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) { 670 - err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, 671 - ring->q_index, 672 - ring->q_vector->napi.napi_id, 673 - ring->rx_buf_len); 674 - if (err) 675 - return err; 676 - } 677 - 678 668 ice_rx_xsk_pool(ring); 679 669 err = ice_realloc_rx_xdp_bufs(ring, ring->xsk_pool); 680 670 if (err) 681 671 return err; 682 672 683 673 if (ring->xsk_pool) { 684 - xdp_rxq_info_unreg(&ring->xdp_rxq); 685 - 686 - rx_buf_len = 687 - xsk_pool_get_rx_frame_size(ring->xsk_pool); 674 + u32 frag_size = 675 + xsk_pool_get_rx_frag_step(ring->xsk_pool); 688 676 err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, 689 677 ring->q_index, 690 678 ring->q_vector->napi.napi_id, 691 - rx_buf_len); 679 + frag_size); 692 680 if (err) 693 681 return err; 694 682 err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, ··· 693 705 if (err) 694 706 return err; 695 707 696 - if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) { 697 - err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, 698 - ring->q_index, 699 - ring->q_vector->napi.napi_id, 700 - ring->rx_buf_len); 701 - if (err) 702 - goto err_destroy_fq; 703 - } 708 + err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, 709 + ring->q_index, 710 + ring->q_vector->napi.napi_id, 711 + ring->truesize); 712 + if (err) 713 + goto err_destroy_fq; 714 + 704 715 xdp_rxq_info_attach_page_pool(&ring->xdp_rxq, 705 716 ring->pp); 706 717 }
+1
drivers/net/ethernet/intel/ice/ice_ethtool.c
··· 3342 3342 rx_rings[i].cached_phctime = pf->ptp.cached_phc_time; 3343 3343 rx_rings[i].desc = NULL; 3344 3344 rx_rings[i].xdp_buf = NULL; 3345 + rx_rings[i].xdp_rxq = (struct xdp_rxq_info){ }; 3345 3346 3346 3347 /* this is to allow wr32 to have something to write to 3347 3348 * during early allocation of Rx buffers
+3 -1
drivers/net/ethernet/intel/ice/ice_txrx.c
··· 560 560 i = 0; 561 561 } 562 562 563 - if (rx_ring->vsi->type == ICE_VSI_PF && 563 + if ((rx_ring->vsi->type == ICE_VSI_PF || 564 + rx_ring->vsi->type == ICE_VSI_SF || 565 + rx_ring->vsi->type == ICE_VSI_LB) && 564 566 xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) { 565 567 xdp_rxq_info_detach_mem_model(&rx_ring->xdp_rxq); 566 568 xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
+3
drivers/net/ethernet/intel/ice/ice_xsk.c
··· 899 899 u16 ntc = rx_ring->next_to_clean; 900 900 u16 ntu = rx_ring->next_to_use; 901 901 902 + if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) 903 + xdp_rxq_info_unreg(&rx_ring->xdp_rxq); 904 + 902 905 while (ntc != ntu) { 903 906 struct xdp_buff *xdp = *ice_xdp_buf(rx_ring, ntc); 904 907
+5 -1
drivers/net/ethernet/intel/idpf/xdp.c
··· 47 47 { 48 48 const struct idpf_vport *vport = rxq->q_vector->vport; 49 49 const struct idpf_q_vec_rsrc *rsrc; 50 + u32 frag_size = 0; 50 51 bool split; 51 52 int err; 52 53 54 + if (idpf_queue_has(XSK, rxq)) 55 + frag_size = rxq->bufq_sets[0].bufq.truesize; 56 + 53 57 err = __xdp_rxq_info_reg(&rxq->xdp_rxq, vport->netdev, rxq->idx, 54 58 rxq->q_vector->napi.napi_id, 55 - rxq->rx_buf_size); 59 + frag_size); 56 60 if (err) 57 61 return err; 58 62
+1
drivers/net/ethernet/intel/idpf/xsk.c
··· 403 403 bufq->pending = fq.pending; 404 404 bufq->thresh = fq.thresh; 405 405 bufq->rx_buf_size = fq.buf_len; 406 + bufq->truesize = fq.truesize; 406 407 407 408 if (!idpf_xskfq_refill(bufq)) 408 409 netdev_err(bufq->pool->netdev,
+1
drivers/net/ethernet/intel/libeth/xsk.c
··· 167 167 fq->pending = fq->count; 168 168 fq->thresh = libeth_xdp_queue_threshold(fq->count); 169 169 fq->buf_len = xsk_pool_get_rx_frame_size(fq->pool); 170 + fq->truesize = xsk_pool_get_rx_frag_step(fq->pool); 170 171 171 172 return 0; 172 173 }
+3
include/net/libeth/xsk.h
··· 597 597 * @pending: current number of XSkFQEs to refill 598 598 * @thresh: threshold below which the queue is refilled 599 599 * @buf_len: HW-writeable length per each buffer 600 + * @truesize: step between consecutive buffers, 0 if none exists 600 601 * @nid: ID of the closest NUMA node with memory 601 602 */ 602 603 struct libeth_xskfq { ··· 615 614 u32 thresh; 616 615 617 616 u32 buf_len; 617 + u32 truesize; 618 + 618 619 int nid; 619 620 }; 620 621
+10
include/net/xdp_sock_drv.h
··· 51 51 return xsk_pool_get_chunk_size(pool) - xsk_pool_get_headroom(pool); 52 52 } 53 53 54 + static inline u32 xsk_pool_get_rx_frag_step(struct xsk_buff_pool *pool) 55 + { 56 + return pool->unaligned ? 0 : xsk_pool_get_chunk_size(pool); 57 + } 58 + 54 59 static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, 55 60 struct xdp_rxq_info *rxq) 56 61 { ··· 338 333 } 339 334 340 335 static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) 336 + { 337 + return 0; 338 + } 339 + 340 + static inline u32 xsk_pool_get_rx_frag_step(struct xsk_buff_pool *pool) 341 341 { 342 342 return 0; 343 343 }
+4 -2
net/core/filter.c
··· 4150 4150 struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); 4151 4151 skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1]; 4152 4152 struct xdp_rxq_info *rxq = xdp->rxq; 4153 - unsigned int tailroom; 4153 + int tailroom; 4154 4154 4155 4155 if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz) 4156 4156 return -EOPNOTSUPP; 4157 4157 4158 - tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag); 4158 + tailroom = rxq->frag_size - skb_frag_size(frag) - 4159 + skb_frag_off(frag) % rxq->frag_size; 4160 + WARN_ON_ONCE(tailroom < 0); 4159 4161 if (unlikely(offset > tailroom)) 4160 4162 return -EINVAL; 4161 4163