// SPDX-License-Identifier: GPL-2.0

#include "io_uring.h"
#include "napi.h"

#ifdef CONFIG_NET_RX_BUSY_POLL

/* Timeout for cleanup of stale entries. */
#define NAPI_TIMEOUT            (60 * SEC_CONVERSION)

struct io_napi_entry {
        unsigned int            napi_id;
        struct list_head        list;

        unsigned long           timeout;
        struct hlist_node       node;

        struct rcu_head         rcu;
};
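
/*
 * Entries are indexed two ways: by napi_id through ctx->napi_ht for O(1)
 * lookup in __io_napi_add(), and linearly through ctx->napi_list so the
 * busy poll loop can walk every tracked NAPI instance. The rcu_head lets
 * readers traverse either index locklessly while removal is deferred past
 * the RCU grace period.
 */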

/*
 * Lookup is side-effect free; callers that want to keep an entry alive
 * refresh its timeout themselves.
 */
static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list,
                                               unsigned int napi_id)
{
        struct io_napi_entry *e;

        hlist_for_each_entry_rcu(e, hash_list, node) {
                if (e->napi_id != napi_id)
                        continue;
                return e;
        }

        return NULL;
}

static inline ktime_t net_to_ktime(unsigned long t)
{
        /*
         * busy_loop_current_time() reports time as nanoseconds shifted
         * right by 10 (approximating microseconds); shift back to get an
         * approximate ktime_t.
         */
        return ns_to_ktime(t << 10);
}

void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock)
{
        struct hlist_head *hash_list;
        unsigned int napi_id;
        struct sock *sk;
        struct io_napi_entry *e;

        sk = sock->sk;
        if (!sk)
                return;

        napi_id = READ_ONCE(sk->sk_napi_id);

        /* Non-NAPI IDs can be rejected. */
        if (napi_id < MIN_NAPI_ID)
                return;

        hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];

        /* Fast path: refresh the timeout of an already tracked id. */
        rcu_read_lock();
        e = io_napi_hash_find(hash_list, napi_id);
        if (e) {
                e->timeout = jiffies + NAPI_TIMEOUT;
                rcu_read_unlock();
                return;
        }
        rcu_read_unlock();

        e = kmalloc(sizeof(*e), GFP_NOWAIT);
        if (!e)
                return;

        e->napi_id = napi_id;
        e->timeout = jiffies + NAPI_TIMEOUT;

        /*
         * Recheck under the lock: another task may have added the same id
         * between the lockless lookup above and taking ctx->napi_lock.
         */
        spin_lock(&ctx->napi_lock);
        if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
                spin_unlock(&ctx->napi_lock);
                kfree(e);
                return;
        }

        hlist_add_tail_rcu(&e->node, hash_list);
        list_add_tail(&e->list, &ctx->napi_list);
        spin_unlock(&ctx->napi_lock);
}
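
/*
 * Callers normally reach this through the io_napi_add() wrapper in napi.h,
 * which (sketching roughly from this file's perspective; see napi.h for
 * the exact guard conditions) only forwards sockets once busy polling has
 * been enabled on the ring, e.g.:
 *
 *      static inline void io_napi_add(struct io_kiocb *req)
 *      {
 *              struct io_ring_ctx *ctx = req->ctx;
 *              struct socket *sock;
 *
 *              if (!READ_ONCE(ctx->napi_busy_poll_dt))
 *                      return;
 *              sock = sock_from_file(req->file);
 *              if (sock)
 *                      __io_napi_add(ctx, sock);
 *      }
 */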

static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
{
        struct io_napi_entry *e;
        unsigned int i;

        spin_lock(&ctx->napi_lock);
        hash_for_each(ctx->napi_ht, i, e, node) {
                if (time_after(jiffies, e->timeout)) {
                        list_del(&e->list);
                        hash_del_rcu(&e->node);
                        kfree_rcu(e, rcu);
                }
        }
        spin_unlock(&ctx->napi_lock);
}

static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale)
{
        if (is_stale)
                __io_napi_remove_stale(ctx);
}

static inline bool io_napi_busy_loop_timeout(ktime_t start_time,
                                             ktime_t bp)
{
        if (bp) {
                ktime_t end_time = ktime_add(start_time, bp);
                ktime_t now = net_to_ktime(busy_loop_current_time());

                return ktime_after(now, end_time);
        }

        /* No busy poll timeout configured: end after a single pass. */
        return true;
}

static bool io_napi_busy_loop_should_end(void *data,
                                         unsigned long start_time)
{
        struct io_wait_queue *iowq = data;

        if (signal_pending(current))
                return true;
        if (io_should_wake(iowq) || io_has_work(iowq->ctx))
                return true;
        if (io_napi_busy_loop_timeout(net_to_ktime(start_time),
                                      iowq->napi_busy_poll_dt))
                return true;

        return false;
}

static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
                                   void *loop_end_arg)
{
        struct io_napi_entry *e;
        bool (*loop_end)(void *, unsigned long) = NULL;
        bool is_stale = false;

        /*
         * A NULL loop_end_arg means each entry is polled once per call and
         * the caller decides whether to repeat. Otherwise install the
         * io_uring-specific end check so the loop reacts to wakeups and
         * signals.
         */
        if (loop_end_arg)
                loop_end = io_napi_busy_loop_should_end;

        list_for_each_entry_rcu(e, &ctx->napi_list, list) {
                napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
                                   ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);

                if (time_after(jiffies, e->timeout))
                        is_stale = true;
        }

        return is_stale;
}

static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
                                       struct io_wait_queue *iowq)
{
        unsigned long start_time = busy_loop_current_time();
        void *loop_end_arg = NULL;
        bool is_stale = false;

        /*
         * Singular lists use a different napi loop end check function and
         * are only executed once.
         */
        if (list_is_singular(&ctx->napi_list))
                loop_end_arg = iowq;

        rcu_read_lock();
        do {
                is_stale = __io_napi_do_busy_loop(ctx, loop_end_arg);
        } while (!io_napi_busy_loop_should_end(iowq, start_time) && !loop_end_arg);
        rcu_read_unlock();

        io_napi_remove_stale(ctx, is_stale);
}

/*
 * io_napi_init() - Initialize napi settings
 * @ctx: pointer to io-uring context structure
 *
 * Initialize the napi settings in the io-uring context. The busy poll
 * duration defaults to the system-wide net.core.busy_poll sysctl.
 */
void io_napi_init(struct io_ring_ctx *ctx)
{
        u64 sys_dt = READ_ONCE(sysctl_net_busy_poll) * NSEC_PER_USEC;

        INIT_LIST_HEAD(&ctx->napi_list);
        spin_lock_init(&ctx->napi_lock);
        ctx->napi_prefer_busy_poll = false;
        ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt);
}

/*
 * io_napi_free() - Deallocate napi
 * @ctx: pointer to io-uring context structure
 *
 * Free the napi list and the hash table in the io-uring context.
 */
void io_napi_free(struct io_ring_ctx *ctx)
{
        struct io_napi_entry *e;
        unsigned int i;

        spin_lock(&ctx->napi_lock);
        hash_for_each(ctx->napi_ht, i, e, node) {
                hash_del_rcu(&e->node);
                kfree_rcu(e, rcu);
        }
        spin_unlock(&ctx->napi_lock);
}

/*
 * io_register_napi() - Register napi with io-uring
 * @ctx: pointer to io-uring context structure
 * @arg: pointer to io_uring_napi structure
 *
 * Register napi in the io-uring context. On success the previous busy
 * poll timeout and prefer busy poll setting are copied back to @arg.
 */
int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
{
        const struct io_uring_napi curr = {
                .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt),
                .prefer_busy_poll = ctx->napi_prefer_busy_poll
        };
        struct io_uring_napi napi;

        if (ctx->flags & IORING_SETUP_IOPOLL)
                return -EINVAL;
        if (copy_from_user(&napi, arg, sizeof(napi)))
                return -EFAULT;
        if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv)
                return -EINVAL;

        /* Return the previous settings to userspace. */
        if (copy_to_user(arg, &curr, sizeof(curr)))
                return -EFAULT;

        WRITE_ONCE(ctx->napi_busy_poll_dt, napi.busy_poll_to * NSEC_PER_USEC);
        WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll);
        WRITE_ONCE(ctx->napi_enabled, true);
        return 0;
}
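
/*
 * Userspace usage, as a minimal sketch via liburing (assuming liburing
 * 2.6+, which wraps IORING_REGISTER_NAPI in io_uring_register_napi()):
 *
 *      struct io_uring_napi napi = {
 *              .busy_poll_to = 100,    // busy poll for up to 100 usec
 *              .prefer_busy_poll = 1,
 *      };
 *
 *      int ret = io_uring_register_napi(&ring, &napi);
 *      // on success, napi now holds the ring's previous settings
 */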

/*
 * io_unregister_napi() - Unregister napi with io-uring
 * @ctx: pointer to io-uring context structure
 * @arg: pointer to io_uring_napi structure
 *
 * Unregister napi. If @arg is non-NULL, copy the busy poll timeout and
 * prefer busy poll setting that were in effect to the passed in structure.
 */
int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
{
        const struct io_uring_napi curr = {
                .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt),
                .prefer_busy_poll = ctx->napi_prefer_busy_poll
        };

        if (arg && copy_to_user(arg, &curr, sizeof(curr)))
                return -EFAULT;

        WRITE_ONCE(ctx->napi_busy_poll_dt, 0);
        WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
        WRITE_ONCE(ctx->napi_enabled, false);
        return 0;
}
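
/*
 * The matching liburing call (same assumption as above) is
 * io_uring_unregister_napi(&ring, &napi), where a non-NULL @napi receives
 * the settings that were in effect; passing NULL skips the copy-back.
 */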

/*
 * __io_napi_adjust_timeout() - adjust busy loop timeout
 * @ctx: pointer to io-uring context structure
 * @iowq: pointer to io wait queue
 * @to_wait: overall wait timeout as ktime, or 0 if waiting indefinitely
 *
 * Adjust the busy loop timeout according to the wait timeout and the
 * configured busy poll timeout: busy polling never runs longer than the
 * caller is prepared to wait overall.
 */
void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq,
                              ktime_t to_wait)
{
        ktime_t poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);

        if (to_wait)
                poll_dt = min(poll_dt, to_wait);

        iowq->napi_busy_poll_dt = poll_dt;
}
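
/*
 * Worked example (hypothetical values): with a registered busy poll
 * timeout of 200 usec and a caller waiting at most 50 usec for
 * completions, the effective napi_busy_poll_dt is clamped to 50 usec;
 * with no wait timeout (to_wait == 0) the full 200 usec is used.
 */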

/*
 * __io_napi_busy_loop() - execute busy poll loop
 * @ctx: pointer to io-uring context structure
 * @iowq: pointer to io wait queue
 *
 * Execute the busy poll loop for a non-SQPOLL ring before it goes to
 * sleep waiting for completions. SQPOLL rings poll from the SQ thread
 * via io_napi_sqpoll_busy_poll() instead.
 */
void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
{
        iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);

        if (!(ctx->flags & IORING_SETUP_SQPOLL) && ctx->napi_enabled)
                io_napi_blocking_busy_loop(ctx, iowq);
}

/*
 * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll
 * @ctx: pointer to io-uring context structure
 *
 * Walk the napi list and busy poll each entry once. Returns 1 if any
 * polling was done, 0 if busy polling is disabled or no entries exist.
 */
int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
{
        bool is_stale = false;

        if (!READ_ONCE(ctx->napi_busy_poll_dt))
                return 0;
        if (list_empty_careful(&ctx->napi_list))
                return 0;

        rcu_read_lock();
        is_stale = __io_napi_do_busy_loop(ctx, NULL);
        rcu_read_unlock();

        io_napi_remove_stale(ctx, is_stale);
        return 1;
}

#endif /* CONFIG_NET_RX_BUSY_POLL */