Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: af_packet: Use hrtimer to do the retire operation

In a system with high real-time requirements, the timeout mechanism of
ordinary timers with jiffies granularity is insufficient to meet the
demands for real-time performance. Meanwhile, the optimization of CPU
usage with af_packet is quite significant. Use hrtimer instead of timer
to help compensate for the shortcomings in real-time performance.
On a system with HZ=100 or HZ=250, the update of TP_STATUS_USER is not
timely enough, with fluctuations reaching over 8 ms (on a system with
HZ=250). This is unacceptable in some high real-time systems that require
timely processing of network packets. By replacing the timer with an
hrtimer, if a timeout of 2 ms is set, the update of TP_STATUS_USER can be
stabilized to within 3 ms.

Delete the delete_blk_timer field, because hrtimer_cancel() checks and
waits until the timer callback returns, ensuring the callback is never
entered again.

Simplify the logic related to setting the timeout: only update the hrtimer
expiry time within the hrtimer callback, and no longer update the expiry
time in prb_open_block(), which is called by tpacket_rcv() or the timer
callback. Reasons why the hrtimer is NOT updated in prb_open_block():
1) It would increase complexity to distinguish the two caller scenarios.
2) hrtimer_cancel() and hrtimer_start() would need to be called to update
the TMO of an already enqueued hrtimer, leading to complex shutdown logic.

One side effect of NOT updating the hrtimer when called from tpacket_rcv()
is that a newly opened block triggered by tpacket_rcv() may be retired
earlier than expected. On the other hand, if the timeout were updated in
prb_open_block(), the frequent reception of network packets that leads to
prb_open_block() being called could cause the hrtimer to be removed and
enqueued repeatedly.

The retire hrtimer expiration is unconditional and periodic. If there are
numerous packet sockets on the system, please set an appropriate timeout
to avoid frequent enqueueing of hrtimers.

Reviewed-by: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Link: https://lore.kernel.org/all/20250831100822.1238795-1-jackzxcui1989@163.com/
Signed-off-by: Xin Zhao <jackzxcui1989@163.com>
Link: https://patch.msgid.link/20250908104549.204412-3-jackzxcui1989@163.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Xin Zhao and committed by
Jakub Kicinski
f7460d29 28d2420d

+33 -83
+27 -77
net/packet/af_packet.c
··· 203 203 static int prb_queue_frozen(struct tpacket_kbdq_core *); 204 204 static void prb_open_block(struct tpacket_kbdq_core *, 205 205 struct tpacket_block_desc *); 206 - static void prb_retire_rx_blk_timer_expired(struct timer_list *); 207 - static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *); 206 + static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *); 208 207 static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); 209 208 static void prb_clear_rxhash(struct tpacket_kbdq_core *, 210 209 struct tpacket3_hdr *); ··· 578 579 return proto; 579 580 } 580 581 581 - static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc) 582 - { 583 - timer_delete_sync(&pkc->retire_blk_timer); 584 - } 585 - 586 582 static void prb_shutdown_retire_blk_timer(struct packet_sock *po, 587 583 struct sk_buff_head *rb_queue) 588 584 { 589 585 struct tpacket_kbdq_core *pkc; 590 586 591 587 pkc = GET_PBDQC_FROM_RB(&po->rx_ring); 592 - 593 - spin_lock_bh(&rb_queue->lock); 594 - pkc->delete_blk_timer = 1; 595 - spin_unlock_bh(&rb_queue->lock); 596 - 597 - prb_del_retire_blk_timer(pkc); 598 - } 599 - 600 - static void prb_setup_retire_blk_timer(struct packet_sock *po) 601 - { 602 - struct tpacket_kbdq_core *pkc; 603 - 604 - pkc = GET_PBDQC_FROM_RB(&po->rx_ring); 605 - timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired, 606 - 0); 607 - pkc->retire_blk_timer.expires = jiffies; 588 + hrtimer_cancel(&pkc->retire_blk_timer); 608 589 } 609 590 610 591 static int prb_calc_retire_blk_tmo(struct packet_sock *po, ··· 650 671 p1->version = po->tp_version; 651 672 po->stats.stats3.tp_freeze_q_cnt = 0; 652 673 if (req_u->req3.tp_retire_blk_tov) 653 - p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov; 674 + p1->interval_ktime = ms_to_ktime(req_u->req3.tp_retire_blk_tov); 654 675 else 655 - p1->retire_blk_tov = prb_calc_retire_blk_tmo(po, 656 - req_u->req3.tp_block_size); 657 - p1->tov_in_jiffies = 
msecs_to_jiffies(p1->retire_blk_tov); 676 + p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po, 677 + req_u->req3.tp_block_size)); 658 678 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; 659 679 rwlock_init(&p1->blk_fill_in_prog_lock); 660 680 661 681 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); 662 682 prb_init_ft_ops(p1, req_u); 663 - prb_setup_retire_blk_timer(po); 683 + hrtimer_setup(&p1->retire_blk_timer, prb_retire_rx_blk_timer_expired, 684 + CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); 685 + hrtimer_start(&p1->retire_blk_timer, p1->interval_ktime, 686 + HRTIMER_MODE_REL_SOFT); 664 687 prb_open_block(p1, pbd); 665 688 } 666 689 667 - /* Do NOT update the last_blk_num first. 668 - * Assumes sk_buff_head lock is held. 669 - */ 670 - static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc) 671 - { 672 - mod_timer(&pkc->retire_blk_timer, 673 - jiffies + pkc->tov_in_jiffies); 674 - } 675 - 676 690 /* 677 - * Timer logic: 678 - * 1) We refresh the timer only when we open a block. 679 - * By doing this we don't waste cycles refreshing the timer 680 - * on packet-by-packet basis. 681 - * 682 691 * With a 1MB block-size, on a 1Gbps line, it will take 683 692 * i) ~8 ms to fill a block + ii) memcpy etc. 684 693 * In this cut we are not accounting for the memcpy time. 685 694 * 686 - * So, if the user sets the 'tmo' to 10ms then the timer 687 - * will never fire while the block is still getting filled 688 - * (which is what we want). However, the user could choose 689 - * to close a block early and that's fine. 690 - * 691 - * But when the timer does fire, we check whether or not to refresh it. 692 695 * Since the tmo granularity is in msecs, it is not too expensive 693 696 * to refresh the timer, lets say every '8' msecs. 694 697 * Either the user can set the 'tmo' or we can derive it based on 695 698 * a) line-speed and b) block-size. 696 699 * prb_calc_retire_blk_tmo() calculates the tmo. 
697 - * 698 700 */ 699 - static void prb_retire_rx_blk_timer_expired(struct timer_list *t) 701 + static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t) 700 702 { 701 703 struct packet_sock *po = 702 704 timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer); ··· 689 729 690 730 frozen = prb_queue_frozen(pkc); 691 731 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); 692 - 693 - if (unlikely(pkc->delete_blk_timer)) 694 - goto out; 695 732 696 733 /* We only need to plug the race when the block is partially filled. 697 734 * tpacket_rcv: ··· 706 749 } 707 750 708 751 if (!frozen) { 709 - if (!BLOCK_NUM_PKTS(pbd)) { 710 - /* An empty block. Just refresh the timer. */ 711 - goto refresh_timer; 752 + if (BLOCK_NUM_PKTS(pbd)) { 753 + /* Not an empty block. Need retire the block. */ 754 + prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); 755 + prb_dispatch_next_block(pkc, po); 712 756 } 713 - prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); 714 - if (!prb_dispatch_next_block(pkc, po)) 715 - goto refresh_timer; 716 - else 717 - goto out; 718 757 } else { 719 758 /* Case 1. Queue was frozen because user-space was 720 759 * lagging behind. 721 760 */ 722 - if (prb_curr_blk_in_use(pbd)) { 723 - /* 724 - * Ok, user-space is still behind. 725 - * So just refresh the timer. 726 - */ 727 - goto refresh_timer; 728 - } else { 761 + if (!prb_curr_blk_in_use(pbd)) { 729 762 /* Case 2. queue was frozen,user-space caught up, 730 763 * now the link went idle && the timer fired. 731 764 * We don't have a block to close.So we open this ··· 724 777 * Thawing/timer-refresh is a side effect. 
725 778 */ 726 779 prb_open_block(pkc, pbd); 727 - goto out; 728 780 } 729 781 } 730 782 731 - refresh_timer: 732 - _prb_refresh_rx_retire_blk_timer(pkc); 733 - 734 - out: 783 + hrtimer_forward_now(&pkc->retire_blk_timer, pkc->interval_ktime); 735 784 spin_unlock(&po->sk.sk_receive_queue.lock); 785 + return HRTIMER_RESTART; 736 786 } 737 787 738 788 static void prb_flush_block(struct tpacket_kbdq_core *pkc1, ··· 823 879 } 824 880 825 881 /* 826 - * Side effect of opening a block: 882 + * prb_open_block is called by tpacket_rcv or timer callback. 827 883 * 828 - * 1) prb_queue is thawed. 829 - * 2) retire_blk_timer is refreshed. 884 + * Reasons why NOT update hrtimer in prb_open_block: 885 + * 1) It will increase complexity to distinguish the two caller scenario. 886 + * 2) hrtimer_cancel and hrtimer_start need to be called if you want to update 887 + * TMO of an already enqueued hrtimer, leading to complex shutdown logic. 830 888 * 889 + * One side effect of NOT update hrtimer when called by tpacket_rcv is that 890 + * a newly opened block triggered by tpacket_rcv may be retired earlier than 891 + * expected. On the other hand, if timeout is updated in prb_open_block, the 892 + * frequent reception of network packets that leads to prb_open_block being 893 + * called may cause hrtimer to be removed and enqueued repeatedly. 831 894 */ 832 895 static void prb_open_block(struct tpacket_kbdq_core *pkc1, 833 896 struct tpacket_block_desc *pbd1) ··· 868 917 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size; 869 918 870 919 prb_thaw_queue(pkc1); 871 - _prb_refresh_rx_retire_blk_timer(pkc1); 872 920 873 921 smp_wmb(); 874 922 }
+1 -1
net/packet/diag.c
··· 83 83 pdr.pdr_frame_nr = ring->frame_max + 1; 84 84 85 85 if (ver > TPACKET_V2) { 86 - pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov; 86 + pdr.pdr_retire_tmo = ktime_to_ms(ring->prb_bdqc.interval_ktime); 87 87 pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv; 88 88 pdr.pdr_features = ring->prb_bdqc.feature_req_word; 89 89 } else {
+5 -5
net/packet/internal.h
··· 20 20 unsigned int feature_req_word; 21 21 unsigned int hdrlen; 22 22 unsigned char reset_pending_on_curr_blk; 23 - unsigned char delete_blk_timer; 24 23 unsigned short kactive_blk_num; 25 24 unsigned short blk_sizeof_priv; 25 + 26 + unsigned short version; 26 27 27 28 char *pkblk_start; 28 29 char *pkblk_end; ··· 33 32 uint64_t knxt_seq_num; 34 33 char *prev; 35 34 char *nxt_offset; 35 + 36 36 struct sk_buff *skb; 37 37 38 38 rwlock_t blk_fill_in_prog_lock; ··· 41 39 /* Default is set to 8ms */ 42 40 #define DEFAULT_PRB_RETIRE_TOV (8) 43 41 44 - unsigned short retire_blk_tov; 45 - unsigned short version; 46 - unsigned long tov_in_jiffies; 42 + ktime_t interval_ktime; 47 43 48 44 /* timer to retire an outstanding block */ 49 - struct timer_list retire_blk_timer; 45 + struct hrtimer retire_blk_timer; 50 46 }; 51 47 52 48 struct pgv {