Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: Extend NAPI threaded polling to allow kthread based busy polling

Add a new state NAPI_STATE_THREADED_BUSY_POLL to the NAPI state enum to
enable and disable threaded busy polling.

When threaded busy polling is enabled for a NAPI, enable
NAPI_STATE_THREADED also.

When the threaded NAPI is scheduled, set NAPI_STATE_IN_BUSY_POLL to
signal napi_complete_done not to rearm interrupts.

Whenever NAPI_STATE_THREADED_BUSY_POLL is unset, NAPI_STATE_IN_BUSY_POLL
is unset as well, and napi_complete_done also clears the
NAPI_STATE_SCHED_THREADED bit, which in turn makes the kthread go to
sleep.

Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Acked-by: Martin Karsten <mkarsten@uwaterloo.ca>
Tested-by: Martin Karsten <mkarsten@uwaterloo.ca>
Link: https://patch.msgid.link/20251028203007.575686-2-skhawaja@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Samiullah Khawaja and committed by
Jakub Kicinski
c18d4b19 998b5d96

+109 -15
+3 -2
Documentation/netlink/specs/netdev.yaml
··· 88 88 - 89 89 name: napi-threaded 90 90 type: enum 91 - entries: [disabled, enabled] 91 + entries: [disabled, enabled, busy-poll] 92 92 93 93 attribute-sets: 94 94 - ··· 291 291 name: threaded 292 292 doc: Whether the NAPI is configured to operate in threaded polling 293 293 mode. If this is set to enabled then the NAPI context operates 294 - in threaded polling mode. 294 + in threaded polling mode. If this is set to busy-poll, then the 295 + threaded polling mode also busy polls. 295 296 type: u32 296 297 enum: napi-threaded 297 298 -
+49 -1
Documentation/networking/napi.rst
··· 263 263 Busy polling is enabled by either setting ``SO_BUSY_POLL`` on 264 264 selected sockets or using the global ``net.core.busy_poll`` and 265 265 ``net.core.busy_read`` sysctls. An io_uring API for NAPI busy polling 266 - also exists. 266 + also exists. Threaded polling of NAPI also has a mode to busy poll for 267 + packets (:ref:`threaded busy polling<threaded_busy_poll>`) using the NAPI 268 + processing kthread. 267 269 268 270 epoll-based busy polling 269 271 ------------------------ ··· 427 425 Therefore, setting ``gro_flush_timeout`` and ``napi_defer_hard_irqs`` is 428 426 the recommended usage, because otherwise setting ``irq-suspend-timeout`` 429 427 might not have any discernible effect. 428 + 429 + .. _threaded_busy_poll: 430 + 431 + Threaded NAPI busy polling 432 + -------------------------- 433 + 434 + Threaded NAPI busy polling extends threaded NAPI and adds support to do 435 + continuous busy polling of the NAPI. This can be useful for forwarding or 436 + AF_XDP applications. 437 + 438 + Threaded NAPI busy polling can be enabled on per NIC queue basis using Netlink. 439 + 440 + For example, using the following script: 441 + 442 + .. code-block:: bash 443 + 444 + $ ynl --family netdev --do napi-set \ 445 + --json='{"id": 66, "threaded": "busy-poll"}' 446 + 447 + The kernel will create a kthread that busy polls on this NAPI. 448 + 449 + The user may elect to set the CPU affinity of this kthread to an unused CPU 450 + core to improve how often the NAPI is polled at the expense of wasted CPU 451 + cycles. Note that this will keep the CPU core busy with 100% usage. 452 + 453 + Once threaded busy polling is enabled for a NAPI, PID of the kthread can be 454 + retrieved using Netlink so the affinity of the kthread can be set up. 455 + 456 + For example, the following script can be used to fetch the PID: 457 + 458 + .. code-block:: bash 459 + 460 + $ ynl --family netdev --do napi-get --json='{"id": 66}' 461 + 462 + This will output something like following, the pid `258` is the PID of the 463 + kthread that is polling this NAPI. 464 + 465 + .. code-block:: bash 466 + 467 + $ {'defer-hard-irqs': 0, 468 + 'gro-flush-timeout': 0, 469 + 'id': 66, 470 + 'ifindex': 2, 471 + 'irq-suspend-timeout': 0, 472 + 'pid': 258, 473 + 'threaded': 'busy-poll'} 430 474 431 475 .. _threaded: 432 476
+3 -1
include/linux/netdevice.h
··· 423 423 NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ 424 424 NAPI_STATE_LISTED, /* NAPI added to system lists */ 425 425 NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */ 426 - NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */ 426 + NAPI_STATE_IN_BUSY_POLL, /* Do not rearm NAPI interrupt */ 427 427 NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/ 428 428 NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/ 429 429 NAPI_STATE_SCHED_THREADED, /* Napi is currently scheduled in threaded mode */ 430 430 NAPI_STATE_HAS_NOTIFIER, /* Napi has an IRQ notifier */ 431 + NAPI_STATE_THREADED_BUSY_POLL, /* The threaded NAPI poller will busy poll */ 431 432 }; 432 433 433 434 enum { ··· 443 442 NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED), 444 443 NAPIF_STATE_SCHED_THREADED = BIT(NAPI_STATE_SCHED_THREADED), 445 444 NAPIF_STATE_HAS_NOTIFIER = BIT(NAPI_STATE_HAS_NOTIFIER), 445 + NAPIF_STATE_THREADED_BUSY_POLL = BIT(NAPI_STATE_THREADED_BUSY_POLL), 446 446 }; 447 447 448 448 enum gro_result {
+1
include/uapi/linux/netdev.h
··· 80 80 enum netdev_napi_threaded { 81 81 NETDEV_NAPI_THREADED_DISABLED, 82 82 NETDEV_NAPI_THREADED_ENABLED, 83 + NETDEV_NAPI_THREADED_BUSY_POLL, 83 84 }; 84 85 85 86 enum {
+48 -10
net/core/dev.c
··· 7089 7089 */ 7090 7090 if ((val & NAPIF_STATE_SCHED_THREADED) || 7091 7091 !(val & NAPIF_STATE_SCHED)) { 7092 - new = val & (~NAPIF_STATE_THREADED); 7092 + new = val & (~(NAPIF_STATE_THREADED | 7093 + NAPIF_STATE_THREADED_BUSY_POLL)); 7093 7094 } else { 7094 7095 msleep(20); 7095 7096 continue; ··· 7112 7111 7113 7112 kthread_stop(napi->thread); 7114 7113 napi->thread = NULL; 7114 + } 7115 + 7116 + static void napi_set_threaded_state(struct napi_struct *napi, 7117 + enum netdev_napi_threaded threaded_mode) 7118 + { 7119 + bool threaded = threaded_mode != NETDEV_NAPI_THREADED_DISABLED; 7120 + bool busy_poll = threaded_mode == NETDEV_NAPI_THREADED_BUSY_POLL; 7121 + 7122 + assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); 7123 + assign_bit(NAPI_STATE_THREADED_BUSY_POLL, &napi->state, busy_poll); 7115 7124 } 7116 7125 7117 7126 int napi_set_threaded(struct napi_struct *napi, ··· 7150 7139 } else { 7151 7140 /* Make sure kthread is created before THREADED bit is set. */ 7152 7141 smp_mb__before_atomic(); 7153 - assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); 7142 + napi_set_threaded_state(napi, threaded); 7154 7143 } 7155 7144 7156 7145 return 0; ··· 7542 7531 } 7543 7532 7544 7533 new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; 7545 - new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); 7534 + new &= ~(NAPIF_STATE_THREADED | 7535 + NAPIF_STATE_THREADED_BUSY_POLL | 7536 + NAPIF_STATE_PREFER_BUSY_POLL); 7546 7537 } while (!try_cmpxchg(&n->state, &val, new)); 7547 7538 7548 7539 hrtimer_cancel(&n->timer); ··· 7756 7743 return -1; 7757 7744 } 7758 7745 7759 - static void napi_threaded_poll_loop(struct napi_struct *napi) 7746 + static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll) 7760 7747 { 7761 7748 struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; 7762 7749 struct softnet_data *sd; ··· 7785 7772 } 7786 7773 skb_defer_free_flush(); 7787 7774 bpf_net_ctx_clear(bpf_net_ctx); 7775 + 7776 + /* When busy poll is enabled, the old packets are not flushed in 7777 + * napi_complete_done. So flush them here. 7778 + */ 7779 + if (busy_poll) 7780 + gro_flush_normal(&napi->gro, HZ >= 1000); 7788 7781 local_bh_enable(); 7782 + 7783 + /* Call cond_resched here to avoid watchdog warnings. */ 7784 + if (repoll || busy_poll) { 7785 + rcu_softirq_qs_periodic(last_qs); 7786 + cond_resched(); 7787 + } 7789 7788 7790 7789 if (!repoll) 7791 7790 break; 7792 - 7793 - rcu_softirq_qs_periodic(last_qs); 7794 - cond_resched(); 7795 7791 } 7796 7792 } 7797 7793 7798 7794 static int napi_threaded_poll(void *data) 7799 7795 { 7800 7796 struct napi_struct *napi = data; 7797 + bool want_busy_poll; 7798 + bool in_busy_poll; 7799 + unsigned long val; 7801 7800 7802 - while (!napi_thread_wait(napi)) 7803 - napi_threaded_poll_loop(napi); 7801 + while (!napi_thread_wait(napi)) { 7802 + val = READ_ONCE(napi->state); 7803 + 7804 + want_busy_poll = val & NAPIF_STATE_THREADED_BUSY_POLL; 7805 + in_busy_poll = val & NAPIF_STATE_IN_BUSY_POLL; 7806 + 7807 + if (unlikely(val & NAPIF_STATE_DISABLE)) 7808 + want_busy_poll = false; 7809 + 7810 + if (want_busy_poll != in_busy_poll) 7811 + assign_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state, 7812 + want_busy_poll); 7813 + 7814 + napi_threaded_poll_loop(napi, want_busy_poll); 7815 + } 7804 7816 7805 7817 return 0; 7806 7818 } ··· 13135 13097 { 13136 13098 struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); 13137 13099 13138 - napi_threaded_poll_loop(&sd->backlog, false); 13100 + napi_threaded_poll_loop(&sd->backlog, false); 13139 13101 13140 13102 } 13141 13103 static void backlog_napi_setup(unsigned int cpu)
+3
net/core/dev.h
··· 317 317 318 318 static inline enum netdev_napi_threaded napi_get_threaded(struct napi_struct *n) 319 319 { 320 + if (test_bit(NAPI_STATE_THREADED_BUSY_POLL, &n->state)) 321 + return NETDEV_NAPI_THREADED_BUSY_POLL; 322 + 320 323 if (test_bit(NAPI_STATE_THREADED, &n->state)) 321 324 return NETDEV_NAPI_THREADED_ENABLED; 322 325
+1 -1
net/core/netdev-genl-gen.c
··· 97 97 [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range), 98 98 [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, }, 99 99 [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, }, 100 - [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 1), 100 + [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 2), 101 101 }; 102 102 103 103 /* NETDEV_CMD_BIND_TX - do */
+1
tools/include/uapi/linux/netdev.h
··· 80 80 enum netdev_napi_threaded { 81 81 NETDEV_NAPI_THREADED_DISABLED, 82 82 NETDEV_NAPI_THREADED_ENABLED, 83 + NETDEV_NAPI_THREADED_BUSY_POLL, 83 84 }; 84 85 85 86 enum {