Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: Add SO_BUSY_POLL_BUDGET socket option

This option lets a user set a per-socket NAPI budget for
busy-polling. If the option is not set, it will use the default of 8.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/bpf/20201130185205.196029-3-bjorn.topel@gmail.com

authored by

Björn Töpel and committed by
Daniel Borkmann
7c951caf 7fd3253a

+34 -14
+1
arch/alpha/include/uapi/asm/socket.h
··· 125 125 #define SO_DETACH_REUSEPORT_BPF 68 126 126 127 127 #define SO_PREFER_BUSY_POLL 69 128 + #define SO_BUSY_POLL_BUDGET 70 128 129 129 130 #if !defined(__KERNEL__) 130 131
+1
arch/mips/include/uapi/asm/socket.h
··· 136 136 #define SO_DETACH_REUSEPORT_BPF 68 137 137 138 138 #define SO_PREFER_BUSY_POLL 69 139 + #define SO_BUSY_POLL_BUDGET 70 139 140 140 141 #if !defined(__KERNEL__) 141 142
+1
arch/parisc/include/uapi/asm/socket.h
··· 117 117 #define SO_DETACH_REUSEPORT_BPF 0x4042 118 118 119 119 #define SO_PREFER_BUSY_POLL 0x4043 120 + #define SO_BUSY_POLL_BUDGET 0x4044 120 121 121 122 #if !defined(__KERNEL__) 122 123
+1
arch/sparc/include/uapi/asm/socket.h
··· 118 118 #define SO_DETACH_REUSEPORT_BPF 0x0047 119 119 120 120 #define SO_PREFER_BUSY_POLL 0x0048 121 + #define SO_BUSY_POLL_BUDGET 0x0049 121 122 122 123 #if !defined(__KERNEL__) 123 124
+2 -1
fs/eventpoll.c
··· 397 397 unsigned int napi_id = READ_ONCE(ep->napi_id); 398 398 399 399 if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) 400 - napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false); 400 + napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false, 401 + BUSY_POLL_BUDGET); 401 402 } 402 403 403 404 static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
+5 -2
include/net/busy_poll.h
··· 23 23 */ 24 24 #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) 25 25 26 + #define BUSY_POLL_BUDGET 8 27 + 26 28 #ifdef CONFIG_NET_RX_BUSY_POLL 27 29 28 30 struct napi_struct; ··· 45 43 46 44 void napi_busy_loop(unsigned int napi_id, 47 45 bool (*loop_end)(void *, unsigned long), 48 - void *loop_end_arg, bool prefer_busy_poll); 46 + void *loop_end_arg, bool prefer_busy_poll, u16 budget); 49 47 50 48 #else /* CONFIG_NET_RX_BUSY_POLL */ 51 49 static inline unsigned long net_busy_loop_on(void) ··· 108 106 109 107 if (napi_id >= MIN_NAPI_ID) 110 108 napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk, 111 - READ_ONCE(sk->sk_prefer_busy_poll)); 109 + READ_ONCE(sk->sk_prefer_busy_poll), 110 + READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET); 112 111 #endif 113 112 } 114 113
+2
include/net/sock.h
··· 302 302 * @sk_max_ack_backlog: listen backlog set in listen() 303 303 * @sk_uid: user id of owner 304 304 * @sk_prefer_busy_poll: prefer busypolling over softirq processing 305 + * @sk_busy_poll_budget: napi processing budget when busypolling 305 306 * @sk_priority: %SO_PRIORITY setting 306 307 * @sk_type: socket type (%SOCK_STREAM, etc) 307 308 * @sk_protocol: which protocol this socket belongs in this network family ··· 483 482 kuid_t sk_uid; 484 483 #ifdef CONFIG_NET_RX_BUSY_POLL 485 484 u8 sk_prefer_busy_poll; 485 + u16 sk_busy_poll_budget; 486 486 #endif 487 487 struct pid *sk_peer_pid; 488 488 const struct cred *sk_peer_cred;
+1
include/uapi/asm-generic/socket.h
··· 120 120 #define SO_DETACH_REUSEPORT_BPF 68 121 121 122 122 #define SO_PREFER_BUSY_POLL 69 123 + #define SO_BUSY_POLL_BUDGET 70 123 124 124 125 #if !defined(__KERNEL__) 125 126
+10 -11
net/core/dev.c
··· 6496 6496 6497 6497 #if defined(CONFIG_NET_RX_BUSY_POLL) 6498 6498 6499 - #define BUSY_POLL_BUDGET 8 6500 - 6501 6499 static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) 6502 6500 { 6503 6501 if (!skip_schedule) { ··· 6515 6517 clear_bit(NAPI_STATE_SCHED, &napi->state); 6516 6518 } 6517 6519 6518 - static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll) 6520 + static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll, 6521 + u16 budget) 6519 6522 { 6520 6523 bool skip_schedule = false; 6521 6524 unsigned long timeout; ··· 6548 6549 /* All we really want here is to re-enable device interrupts. 6549 6550 * Ideally, a new ndo_busy_poll_stop() could avoid another round. 6550 6551 */ 6551 - rc = napi->poll(napi, BUSY_POLL_BUDGET); 6552 + rc = napi->poll(napi, budget); 6552 6553 /* We can't gro_normal_list() here, because napi->poll() might have 6553 6554 * rearmed the napi (napi_complete_done()) in which case it could 6554 6555 * already be running on another CPU. 6555 6556 */ 6556 - trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); 6557 + trace_napi_poll(napi, rc, budget); 6557 6558 netpoll_poll_unlock(have_poll_lock); 6558 - if (rc == BUSY_POLL_BUDGET) 6559 + if (rc == budget) 6559 6560 __busy_poll_stop(napi, skip_schedule); 6560 6561 local_bh_enable(); 6561 6562 } 6562 6563 6563 6564 void napi_busy_loop(unsigned int napi_id, 6564 6565 bool (*loop_end)(void *, unsigned long), 6565 - void *loop_end_arg, bool prefer_busy_poll) 6566 + void *loop_end_arg, bool prefer_busy_poll, u16 budget) 6566 6567 { 6567 6568 unsigned long start_time = loop_end ? 
busy_loop_current_time() : 0; 6568 6569 int (*napi_poll)(struct napi_struct *napi, int budget); ··· 6605 6606 have_poll_lock = netpoll_poll_lock(napi); 6606 6607 napi_poll = napi->poll; 6607 6608 } 6608 - work = napi_poll(napi, BUSY_POLL_BUDGET); 6609 - trace_napi_poll(napi, work, BUSY_POLL_BUDGET); 6609 + work = napi_poll(napi, budget); 6610 + trace_napi_poll(napi, work, budget); 6610 6611 gro_normal_list(napi); 6611 6612 count: 6612 6613 if (work > 0) ··· 6619 6620 6620 6621 if (unlikely(need_resched())) { 6621 6622 if (napi_poll) 6622 - busy_poll_stop(napi, have_poll_lock, prefer_busy_poll); 6623 + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); 6623 6624 preempt_enable(); 6624 6625 rcu_read_unlock(); 6625 6626 cond_resched(); ··· 6630 6631 cpu_relax(); 6631 6632 } 6632 6633 if (napi_poll) 6633 - busy_poll_stop(napi, have_poll_lock, prefer_busy_poll); 6634 + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); 6634 6635 preempt_enable(); 6635 6636 out: 6636 6637 rcu_read_unlock();
+10
net/core/sock.c
··· 1165 1165 else 1166 1166 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); 1167 1167 break; 1168 + case SO_BUSY_POLL_BUDGET: 1169 + if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) { 1170 + ret = -EPERM; 1171 + } else { 1172 + if (val < 0 || val > U16_MAX) 1173 + ret = -EINVAL; 1174 + else 1175 + WRITE_ONCE(sk->sk_busy_poll_budget, val); 1176 + } 1177 + break; 1168 1178 #endif 1169 1179 1170 1180 case SO_MAX_PACING_RATE: