Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
// SPDX-License-Identifier: GPL-2.0-only
/*
 * count the number of connections matching an arbitrary key.
 *
 * (C) 2017 Red Hat GmbH
 * Author: Florian Westphal <fw@strlen.de>
 *
 * split from xt_connlimit.c:
 *   (c) 2000 Gerd Knorr <kraxel@bytesex.org>
 *   Nov 2002: Martin Bene <martin.bene@icomedias.com>:
 *	only ignore TIME_WAIT or gone connections
 *   (C) CC Computer Consultants GmbH, 2007
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/spinlock.h>
#include <linux/netfilter/nf_conntrack_tcp.h>
#include <linux/netfilter/x_tables.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_count.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_zones.h>

#define CONNCOUNT_SLOTS		256U

#define CONNCOUNT_GC_MAX_NODES	8
#define MAX_KEYLEN		5

/* we will save the tuples of all connections we care about */
struct nf_conncount_tuple {
	struct list_head	node;
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_zone zone;
	int			cpu;
	u32			jiffies32;
};

struct nf_conncount_rb {
	struct rb_node node;
	struct nf_conncount_list list;
	u32 key[MAX_KEYLEN];
	struct rcu_head rcu_head;
};

static spinlock_t nf_conncount_locks[CONNCOUNT_SLOTS] __cacheline_aligned_in_smp;

struct nf_conncount_data {
	unsigned int keylen;
	struct rb_root root[CONNCOUNT_SLOTS];
	struct net *net;
	struct work_struct gc_work;
	unsigned long pending_trees[BITS_TO_LONGS(CONNCOUNT_SLOTS)];
	unsigned int gc_tree;
};

static u_int32_t conncount_rnd __read_mostly;
static struct kmem_cache *conncount_rb_cachep __read_mostly;
static struct kmem_cache *conncount_conn_cachep __read_mostly;

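/* TCP conntrack entries in TIME_WAIT or CLOSE state are treated as gone
 * and no longer counted.
 */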
static inline bool already_closed(const struct nf_conn *conn)
{
	if (nf_ct_protonum(conn) == IPPROTO_TCP)
		return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT ||
		       conn->proto.tcp.state == TCP_CONNTRACK_CLOSE;
	else
		return false;
}

static int key_diff(const u32 *a, const u32 *b, unsigned int klen)
{
	return memcmp(a, b, klen * sizeof(u32));
}

static void conn_free(struct nf_conncount_list *list,
		      struct nf_conncount_tuple *conn)
{
	lockdep_assert_held(&list->list_lock);

	list->count--;
	list_del(&conn->node);

	kmem_cache_free(conncount_conn_cachep, conn);
}

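/* Look up @conn's tuple in the conntrack table.  Returns the conntrack
 * hash on success.  Returns ERR_PTR(-ENOENT) if the entry is gone and
 * has been removed from @list, or ERR_PTR(-EAGAIN) if it might still be
 * an unconfirmed entry added by another CPU and should be kept for now.
 */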
static const struct nf_conntrack_tuple_hash *
find_or_evict(struct net *net, struct nf_conncount_list *list,
	      struct nf_conncount_tuple *conn)
{
	const struct nf_conntrack_tuple_hash *found;
	unsigned long a, b;
	int cpu = raw_smp_processor_id();
	u32 age;

	found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple);
	if (found)
		return found;
	b = conn->jiffies32;
	a = (u32)jiffies;

	/* conn might have been added just before by another cpu and
	 * might still be unconfirmed. In this case, nf_conntrack_find()
	 * returns no result. Thus only evict if this cpu added the
	 * stale entry or if the entry is older than two jiffies.
	 */
	age = a - b;
	if (conn->cpu == cpu || age >= 2) {
		conn_free(list, conn);
		return ERR_PTR(-ENOENT);
	}

	return ERR_PTR(-EAGAIN);
}

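/* Extract the original-direction tuple and the zone for this packet.
 * Uses the conntrack entry attached to @skb when there is one, otherwise
 * parses the packet headers and looks the tuple up in the conntrack table.
 * Sets *@ct when a conntrack entry is known; *@refcounted is set to true
 * when that entry holds a reference the caller must drop with nf_ct_put().
 * Returns false if no tuple could be extracted.
 */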
static bool get_ct_or_tuple_from_skb(struct net *net,
				     const struct sk_buff *skb,
				     u16 l3num,
				     struct nf_conn **ct,
				     struct nf_conntrack_tuple *tuple,
				     const struct nf_conntrack_zone **zone,
				     bool *refcounted)
{
	const struct nf_conntrack_tuple_hash *h;
	enum ip_conntrack_info ctinfo;
	struct nf_conn *found_ct;

	found_ct = nf_ct_get(skb, &ctinfo);
	if (found_ct && !nf_ct_is_template(found_ct)) {
		*tuple = found_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
		*zone = nf_ct_zone(found_ct);
		*ct = found_ct;
		return true;
	}

	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num, net, tuple))
		return false;

	if (found_ct)
		*zone = nf_ct_zone(found_ct);

	h = nf_conntrack_find_get(net, *zone, tuple);
	if (!h)
		return true;

	found_ct = nf_ct_tuplehash_to_ctrack(h);
	*refcounted = true;
	*ct = found_ct;

	return true;
}

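/* Add the connection found in @skb to @list.  Caller must hold
 * list->list_lock.  Returns 0 when the connection was added or is
 * already present on the list, -EEXIST when the packet's conntrack
 * entry is already confirmed (nothing to do), -ENOENT when no tuple
 * could be extracted, and -ENOMEM or -EOVERFLOW on failure.
 */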
static int __nf_conncount_add(struct net *net,
			      const struct sk_buff *skb,
			      u16 l3num,
			      struct nf_conncount_list *list)
{
	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
	const struct nf_conntrack_tuple_hash *found;
	struct nf_conncount_tuple *conn, *conn_n;
	struct nf_conntrack_tuple tuple;
	struct nf_conn *ct = NULL;
	struct nf_conn *found_ct;
	unsigned int collect = 0;
	bool refcounted = false;
	int err = 0;

	if (!get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted))
		return -ENOENT;

	if (ct && nf_ct_is_confirmed(ct)) {
		err = -EEXIST;
		goto out_put;
	}

	if ((u32)jiffies == list->last_gc)
		goto add_new_node;

	/* check the saved connections */
	list_for_each_entry_safe(conn, conn_n, &list->head, node) {
		if (collect > CONNCOUNT_GC_MAX_NODES)
			break;

		found = find_or_evict(net, list, conn);
		if (IS_ERR(found)) {
			/* Not found, but might be about to be confirmed */
			if (PTR_ERR(found) == -EAGAIN) {
				if (nf_ct_tuple_equal(&conn->tuple, &tuple) &&
				    nf_ct_zone_id(&conn->zone, conn->zone.dir) ==
				    nf_ct_zone_id(zone, zone->dir))
					goto out_put; /* already exists */
			} else {
				collect++;
			}
			continue;
		}

		found_ct = nf_ct_tuplehash_to_ctrack(found);

		if (nf_ct_tuple_equal(&conn->tuple, &tuple) &&
		    nf_ct_zone_equal(found_ct, zone, zone->dir)) {
			/*
			 * We should not see tuples twice unless someone hooks
			 * this into a table without "-p tcp --syn".
			 *
			 * Attempt to avoid a re-add in this case.
			 */
			nf_ct_put(found_ct);
			goto out_put;
		} else if (already_closed(found_ct)) {
			/*
			 * we do not care about connections which are
			 * closed already -> ditch it
			 */
			nf_ct_put(found_ct);
			conn_free(list, conn);
			collect++;
			continue;
		}

		nf_ct_put(found_ct);
	}
	list->last_gc = (u32)jiffies;

add_new_node:
	if (WARN_ON_ONCE(list->count > INT_MAX)) {
		err = -EOVERFLOW;
		goto out_put;
	}

	conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
	if (conn == NULL) {
		err = -ENOMEM;
		goto out_put;
	}

	conn->tuple = tuple;
	conn->zone = *zone;
	conn->cpu = raw_smp_processor_id();
	conn->jiffies32 = (u32)jiffies;
	list_add_tail(&conn->node, &list->head);
	list->count++;

out_put:
	if (refcounted)
		nf_ct_put(ct);
	return err;
}

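/* Locked wrapper around __nf_conncount_add(). */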
int nf_conncount_add_skb(struct net *net,
			 const struct sk_buff *skb,
			 u16 l3num,
			 struct nf_conncount_list *list)
{
	int ret;

	/* check the saved connections */
	spin_lock_bh(&list->list_lock);
	ret = __nf_conncount_add(net, skb, l3num, list);
	spin_unlock_bh(&list->list_lock);

	return ret;
}
EXPORT_SYMBOL_GPL(nf_conncount_add_skb);

void nf_conncount_list_init(struct nf_conncount_list *list)
{
	spin_lock_init(&list->list_lock);
	INIT_LIST_HEAD(&list->head);
	list->count = 0;
	list->last_gc = (u32)jiffies;
}
EXPORT_SYMBOL_GPL(nf_conncount_list_init);

/* Return true if the list is empty. Must be called with BH disabled. */
static bool __nf_conncount_gc_list(struct net *net,
				   struct nf_conncount_list *list)
{
	const struct nf_conntrack_tuple_hash *found;
	struct nf_conncount_tuple *conn, *conn_n;
	struct nf_conn *found_ct;
	unsigned int collected = 0;
	bool ret = false;

	/* don't bother if we just did GC */
	if ((u32)jiffies == READ_ONCE(list->last_gc))
		return false;

	list_for_each_entry_safe(conn, conn_n, &list->head, node) {
		found = find_or_evict(net, list, conn);
		if (IS_ERR(found)) {
			if (PTR_ERR(found) == -ENOENT)
				collected++;
			continue;
		}

		found_ct = nf_ct_tuplehash_to_ctrack(found);
		if (already_closed(found_ct)) {
			/*
			 * we do not care about connections which are
			 * closed already -> ditch it
			 */
			nf_ct_put(found_ct);
			conn_free(list, conn);
			collected++;
			continue;
		}

		nf_ct_put(found_ct);
		if (collected > CONNCOUNT_GC_MAX_NODES)
			break;
	}

	if (!list->count)
		ret = true;
	list->last_gc = (u32)jiffies;

	return ret;
}

bool nf_conncount_gc_list(struct net *net,
			  struct nf_conncount_list *list)
{
	bool ret;

	/* don't bother if other cpu is already doing GC */
	if (!spin_trylock_bh(&list->list_lock))
		return false;

	ret = __nf_conncount_gc_list(net, list);
	spin_unlock_bh(&list->list_lock);

	return ret;
}
EXPORT_SYMBOL_GPL(nf_conncount_gc_list);

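/* RCU callback used by tree_nodes_free(): release an rb node once the
 * lockless readers in count_tree() can no longer see it.
 */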
static void __tree_nodes_free(struct rcu_head *h)
{
	struct nf_conncount_rb *rbconn;

	rbconn = container_of(h, struct nf_conncount_rb, rcu_head);
	kmem_cache_free(conncount_rb_cachep, rbconn);
}

/* caller must hold tree nf_conncount_locks[] lock */
static void tree_nodes_free(struct rb_root *root,
			    struct nf_conncount_rb *gc_nodes[],
			    unsigned int gc_count)
{
	struct nf_conncount_rb *rbconn;

	while (gc_count) {
		rbconn = gc_nodes[--gc_count];
		spin_lock(&rbconn->list.list_lock);
		if (!rbconn->list.count) {
			rb_erase(&rbconn->node, root);
			call_rcu(&rbconn->rcu_head, __tree_nodes_free);
		}
		spin_unlock(&rbconn->list.list_lock);
	}
}

static void schedule_gc_worker(struct nf_conncount_data *data, int tree)
{
	set_bit(tree, data->pending_trees);
	schedule_work(&data->gc_work);
}

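/* Slow path: take the per-slot lock and walk the tree again.  If a node
 * with @key exists, account the packet against its list; otherwise do a
 * bounded garbage collection pass and insert a fresh node seeded with
 * the packet's tuple.  Returns the resulting connection count, or 0 on
 * failure (hotdrop).
 */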
static unsigned int
insert_tree(struct net *net,
	    const struct sk_buff *skb,
	    u16 l3num,
	    struct nf_conncount_data *data,
	    struct rb_root *root,
	    unsigned int hash,
	    const u32 *key)
{
	struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];
	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
	bool do_gc = true, refcounted = false;
	unsigned int count = 0, gc_count = 0;
	struct rb_node **rbnode, *parent;
	struct nf_conntrack_tuple tuple;
	struct nf_conncount_tuple *conn;
	struct nf_conncount_rb *rbconn;
	struct nf_conn *ct = NULL;

	spin_lock_bh(&nf_conncount_locks[hash]);
restart:
	parent = NULL;
	rbnode = &(root->rb_node);
	while (*rbnode) {
		int diff;
		rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node);

		parent = *rbnode;
		diff = key_diff(key, rbconn->key, data->keylen);
		if (diff < 0) {
			rbnode = &((*rbnode)->rb_left);
		} else if (diff > 0) {
			rbnode = &((*rbnode)->rb_right);
		} else {
			int ret;

			ret = nf_conncount_add_skb(net, skb, l3num, &rbconn->list);
			if (ret && ret != -EEXIST)
				count = 0; /* hotdrop */
			else
				count = rbconn->list.count;
			tree_nodes_free(root, gc_nodes, gc_count);
			goto out_unlock;
		}

		if (gc_count >= ARRAY_SIZE(gc_nodes))
			continue;

		if (do_gc && nf_conncount_gc_list(net, &rbconn->list))
			gc_nodes[gc_count++] = rbconn;
	}

	if (gc_count) {
		tree_nodes_free(root, gc_nodes, gc_count);
		schedule_gc_worker(data, hash);
		gc_count = 0;
		do_gc = false;
		goto restart;
	}

	if (get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted)) {
		/* expected case: match, insert new node */
		rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
		if (rbconn == NULL)
			goto out_unlock;

		conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
		if (conn == NULL) {
			kmem_cache_free(conncount_rb_cachep, rbconn);
			goto out_unlock;
		}

		conn->tuple = tuple;
		conn->zone = *zone;
		conn->cpu = raw_smp_processor_id();
		conn->jiffies32 = (u32)jiffies;
		memcpy(rbconn->key, key, sizeof(u32) * data->keylen);

		nf_conncount_list_init(&rbconn->list);
		list_add(&conn->node, &rbconn->list.head);
		count = 1;
		rbconn->list.count = count;

		rb_link_node_rcu(&rbconn->node, parent, rbnode);
		rb_insert_color(&rbconn->node, root);
	}
out_unlock:
	if (refcounted)
		nf_ct_put(ct);
	spin_unlock_bh(&nf_conncount_locks[hash]);
	return count;
}

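/* Look up @key in the per-slot rbtree under RCU, without taking the
 * slot lock.  With a NULL @skb this only garbage-collects the matching
 * list and reports its current count.  Otherwise the packet is counted
 * against the matching node, falling back to insert_tree() when the key
 * is absent or the node is about to be freed; a return value of 0 means
 * hotdrop.
 */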
static unsigned int
count_tree(struct net *net,
	   const struct sk_buff *skb,
	   u16 l3num,
	   struct nf_conncount_data *data,
	   const u32 *key)
{
	struct rb_root *root;
	struct rb_node *parent;
	struct nf_conncount_rb *rbconn;
	unsigned int hash;

	hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS;
	root = &data->root[hash];

	parent = rcu_dereference_raw(root->rb_node);
	while (parent) {
		int diff;

		rbconn = rb_entry(parent, struct nf_conncount_rb, node);

		diff = key_diff(key, rbconn->key, data->keylen);
		if (diff < 0) {
			parent = rcu_dereference_raw(parent->rb_left);
		} else if (diff > 0) {
			parent = rcu_dereference_raw(parent->rb_right);
		} else {
			int ret;

			if (!skb) {
				nf_conncount_gc_list(net, &rbconn->list);
				return rbconn->list.count;
			}

			spin_lock_bh(&rbconn->list.list_lock);
			/* Node might be about to be free'd.
			 * We need to defer to insert_tree() in this case.
			 */
			if (rbconn->list.count == 0) {
				spin_unlock_bh(&rbconn->list.list_lock);
				break;
			}

			/* same source network -> be counted! */
			ret = __nf_conncount_add(net, skb, l3num, &rbconn->list);
			spin_unlock_bh(&rbconn->list.list_lock);
			if (ret && ret != -EEXIST) {
				return 0; /* hotdrop */
			} else {
				/* -EEXIST means add was skipped, update the list */
				if (ret == -EEXIST)
					nf_conncount_gc_list(net, &rbconn->list);
				return rbconn->list.count;
			}
		}
	}

	if (!skb)
		return 0;

	return insert_tree(net, skb, l3num, data, root, hash, key);
}

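/* Deferred garbage collection for one tree slot.  A first pass walks the
 * tree locklessly under RCU and garbage-collects each node's list; a
 * second pass under the slot lock removes nodes whose lists went empty,
 * but only if enough of them accumulated to be worth the effort.  If
 * more slots are pending, the work is rescheduled for the next one.
 */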
static void tree_gc_worker(struct work_struct *work)
{
	struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work);
	struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn;
	struct rb_root *root;
	struct rb_node *node;
	unsigned int tree, next_tree, gc_count = 0;

	tree = data->gc_tree % CONNCOUNT_SLOTS;
	root = &data->root[tree];

	local_bh_disable();
	rcu_read_lock();
	for (node = rb_first(root); node != NULL; node = rb_next(node)) {
		rbconn = rb_entry(node, struct nf_conncount_rb, node);
		if (nf_conncount_gc_list(data->net, &rbconn->list))
			gc_count++;
	}
	rcu_read_unlock();
	local_bh_enable();

	cond_resched();

	spin_lock_bh(&nf_conncount_locks[tree]);
	if (gc_count < ARRAY_SIZE(gc_nodes))
		goto next; /* do not bother */

	gc_count = 0;
	node = rb_first(root);
	while (node != NULL) {
		rbconn = rb_entry(node, struct nf_conncount_rb, node);
		node = rb_next(node);

		if (rbconn->list.count > 0)
			continue;

		gc_nodes[gc_count++] = rbconn;
		if (gc_count >= ARRAY_SIZE(gc_nodes)) {
			tree_nodes_free(root, gc_nodes, gc_count);
			gc_count = 0;
		}
	}

	tree_nodes_free(root, gc_nodes, gc_count);
next:
	clear_bit(tree, data->pending_trees);

	next_tree = (tree + 1) % CONNCOUNT_SLOTS;
	next_tree = find_next_bit(data->pending_trees, CONNCOUNT_SLOTS, next_tree);

	if (next_tree < CONNCOUNT_SLOTS) {
		data->gc_tree = next_tree;
		schedule_work(work);
	}

	spin_unlock_bh(&nf_conncount_locks[tree]);
}

/* Count and return number of conntrack entries in 'net' with particular 'key'.
 * If 'skb' is not null, insert the corresponding tuple into the accounting
 * data structure. Call with RCU read lock.
 */
unsigned int nf_conncount_count_skb(struct net *net,
				    const struct sk_buff *skb,
				    u16 l3num,
				    struct nf_conncount_data *data,
				    const u32 *key)
{
	return count_tree(net, skb, l3num, data, key);
}
EXPORT_SYMBOL_GPL(nf_conncount_count_skb);

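/* Allocate the accounting structure for one counting object.  @keylen is
 * the key size in bytes; it must be a nonzero multiple of sizeof(u32)
 * and no larger than MAX_KEYLEN u32 words.
 */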
struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen)
{
	struct nf_conncount_data *data;
	int i;

	if (keylen % sizeof(u32) ||
	    keylen / sizeof(u32) > MAX_KEYLEN ||
	    keylen == 0)
		return ERR_PTR(-EINVAL);

	net_get_random_once(&conncount_rnd, sizeof(conncount_rnd));

	data = kmalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return ERR_PTR(-ENOMEM);

	for (i = 0; i < ARRAY_SIZE(data->root); ++i)
		data->root[i] = RB_ROOT;

	data->keylen = keylen / sizeof(u32);
	data->net = net;
	INIT_WORK(&data->gc_work, tree_gc_worker);

	return data;
}
EXPORT_SYMBOL_GPL(nf_conncount_init);

void nf_conncount_cache_free(struct nf_conncount_list *list)
{
	struct nf_conncount_tuple *conn, *conn_n;

	list_for_each_entry_safe(conn, conn_n, &list->head, node)
		kmem_cache_free(conncount_conn_cachep, conn);
}
EXPORT_SYMBOL_GPL(nf_conncount_cache_free);

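/* Free an entire slot tree; only used from nf_conncount_destroy() after
 * the GC worker has been cancelled.
 */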
static void destroy_tree(struct rb_root *r)
{
	struct nf_conncount_rb *rbconn;
	struct rb_node *node;

	while ((node = rb_first(r)) != NULL) {
		rbconn = rb_entry(node, struct nf_conncount_rb, node);

		rb_erase(node, r);

		nf_conncount_cache_free(&rbconn->list);

		kmem_cache_free(conncount_rb_cachep, rbconn);
	}
}

void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data)
{
	unsigned int i;

	cancel_work_sync(&data->gc_work);

	for (i = 0; i < ARRAY_SIZE(data->root); ++i)
		destroy_tree(&data->root[i]);

	kfree(data);
}
EXPORT_SYMBOL_GPL(nf_conncount_destroy);

static int __init nf_conncount_modinit(void)
{
	int i;

	for (i = 0; i < CONNCOUNT_SLOTS; ++i)
		spin_lock_init(&nf_conncount_locks[i]);

	conncount_conn_cachep = KMEM_CACHE(nf_conncount_tuple, 0);
	if (!conncount_conn_cachep)
		return -ENOMEM;

	conncount_rb_cachep = KMEM_CACHE(nf_conncount_rb, 0);
	if (!conncount_rb_cachep) {
		kmem_cache_destroy(conncount_conn_cachep);
		return -ENOMEM;
	}

	return 0;
}

static void __exit nf_conncount_modexit(void)
{
	kmem_cache_destroy(conncount_conn_cachep);
	kmem_cache_destroy(conncount_rb_cachep);
}

module_init(nf_conncount_modinit);
module_exit(nf_conncount_modexit);
MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
MODULE_DESCRIPTION("netfilter: count number of connections matching a key");
MODULE_LICENSE("GPL");