Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search

This patch is originally from Florian Westphal.

This patch does the following 3 main tasks.

1) Add a list lock to 'struct nf_conncount_list' so that we can
alter the lists containing the individual connections without holding the
main tree lock. This is useful when we only need to add to / remove from
a list without allocating or removing a node in the tree. With this change, we
update nft_connlimit accordingly, since we no longer need to maintain
a separate list lock in nft_connlimit.

2) Use RCU for the initial tree search to improve tree look up performance.

3) Add a garbage collection worker. This worker is scheduled when there
are excessive tree nodes that need to be recycled.

Moreover, the rbnode reclaim logic is moved from the search tree to the insert
tree to avoid a race condition.

Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

authored by

Yi-Hung Wei and committed by
Pablo Neira Ayuso
5c789e13 34848d5c

+196 -91
+13 -4
include/net/netfilter/nf_conntrack_count.h
··· 5 5 6 6 struct nf_conncount_data; 7 7 8 + enum nf_conncount_list_add { 9 + NF_CONNCOUNT_ADDED, /* list add was ok */ 10 + NF_CONNCOUNT_ERR, /* -ENOMEM, must drop skb */ 11 + NF_CONNCOUNT_SKIP, /* list is already reclaimed by gc */ 12 + }; 13 + 8 14 struct nf_conncount_list { 15 + spinlock_t list_lock; 9 16 struct list_head head; /* connections with the same filtering key */ 10 17 unsigned int count; /* length of list */ 18 + bool dead; 11 19 }; 12 20 13 21 struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family, ··· 36 28 37 29 void nf_conncount_list_init(struct nf_conncount_list *list); 38 30 39 - bool nf_conncount_add(struct nf_conncount_list *list, 40 - const struct nf_conntrack_tuple *tuple, 41 - const struct nf_conntrack_zone *zone); 31 + enum nf_conncount_list_add 32 + nf_conncount_add(struct nf_conncount_list *list, 33 + const struct nf_conntrack_tuple *tuple, 34 + const struct nf_conntrack_zone *zone); 42 35 43 - void nf_conncount_gc_list(struct net *net, 36 + bool nf_conncount_gc_list(struct net *net, 44 37 struct nf_conncount_list *list); 45 38 46 39 void nf_conncount_cache_free(struct nf_conncount_list *list);
+181 -72
net/netfilter/nf_conncount.c
··· 49 49 struct nf_conntrack_zone zone; 50 50 int cpu; 51 51 u32 jiffies32; 52 + struct rcu_head rcu_head; 52 53 }; 53 54 54 55 struct nf_conncount_rb { 55 56 struct rb_node node; 56 57 struct nf_conncount_list list; 57 58 u32 key[MAX_KEYLEN]; 59 + struct rcu_head rcu_head; 58 60 }; 59 61 60 62 static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_in_smp; ··· 64 62 struct nf_conncount_data { 65 63 unsigned int keylen; 66 64 struct rb_root root[CONNCOUNT_SLOTS]; 65 + struct net *net; 66 + struct work_struct gc_work; 67 + unsigned long pending_trees[BITS_TO_LONGS(CONNCOUNT_SLOTS)]; 68 + unsigned int gc_tree; 67 69 }; 68 70 69 71 static u_int32_t conncount_rnd __read_mostly; ··· 88 82 return memcmp(a, b, klen * sizeof(u32)); 89 83 } 90 84 91 - bool nf_conncount_add(struct nf_conncount_list *list, 92 - const struct nf_conntrack_tuple *tuple, 93 - const struct nf_conntrack_zone *zone) 85 + enum nf_conncount_list_add 86 + nf_conncount_add(struct nf_conncount_list *list, 87 + const struct nf_conntrack_tuple *tuple, 88 + const struct nf_conntrack_zone *zone) 94 89 { 95 90 struct nf_conncount_tuple *conn; 96 91 97 92 if (WARN_ON_ONCE(list->count > INT_MAX)) 98 - return false; 93 + return NF_CONNCOUNT_ERR; 99 94 100 95 conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); 101 96 if (conn == NULL) 102 - return false; 97 + return NF_CONNCOUNT_ERR; 98 + 103 99 conn->tuple = *tuple; 104 100 conn->zone = *zone; 105 101 conn->cpu = raw_smp_processor_id(); 106 102 conn->jiffies32 = (u32)jiffies; 103 + spin_lock(&list->list_lock); 104 + if (list->dead == true) { 105 + kmem_cache_free(conncount_conn_cachep, conn); 106 + spin_unlock(&list->list_lock); 107 + return NF_CONNCOUNT_SKIP; 108 + } 107 109 list_add_tail(&conn->node, &list->head); 108 110 list->count++; 109 - return true; 111 + spin_unlock(&list->list_lock); 112 + return NF_CONNCOUNT_ADDED; 110 113 } 111 114 EXPORT_SYMBOL_GPL(nf_conncount_add); 112 115 113 - static void conn_free(struct 
nf_conncount_list *list, 116 + static void __conn_free(struct rcu_head *h) 117 + { 118 + struct nf_conncount_tuple *conn; 119 + 120 + conn = container_of(h, struct nf_conncount_tuple, rcu_head); 121 + kmem_cache_free(conncount_conn_cachep, conn); 122 + } 123 + 124 + static bool conn_free(struct nf_conncount_list *list, 114 125 struct nf_conncount_tuple *conn) 115 126 { 116 - if (WARN_ON_ONCE(list->count == 0)) 117 - return; 127 + bool free_entry = false; 128 + 129 + spin_lock(&list->list_lock); 130 + 131 + if (list->count == 0) { 132 + spin_unlock(&list->list_lock); 133 + return free_entry; 134 + } 118 135 119 136 list->count--; 120 - list_del(&conn->node); 121 - kmem_cache_free(conncount_conn_cachep, conn); 137 + list_del_rcu(&conn->node); 138 + if (list->count == 0) 139 + free_entry = true; 140 + 141 + spin_unlock(&list->list_lock); 142 + call_rcu(&conn->rcu_head, __conn_free); 143 + return free_entry; 122 144 } 123 145 124 146 static const struct nf_conntrack_tuple_hash * 125 147 find_or_evict(struct net *net, struct nf_conncount_list *list, 126 - struct nf_conncount_tuple *conn) 148 + struct nf_conncount_tuple *conn, bool *free_entry) 127 149 { 128 150 const struct nf_conntrack_tuple_hash *found; 129 151 unsigned long a, b; ··· 171 137 */ 172 138 age = a - b; 173 139 if (conn->cpu == cpu || age >= 2) { 174 - conn_free(list, conn); 140 + *free_entry = conn_free(list, conn); 175 141 return ERR_PTR(-ENOENT); 176 142 } 177 143 ··· 188 154 struct nf_conncount_tuple *conn, *conn_n; 189 155 struct nf_conn *found_ct; 190 156 unsigned int collect = 0; 157 + bool free_entry = false; 191 158 192 159 /* best effort only */ 193 160 *addit = tuple ? 
true : false; ··· 198 163 if (collect > CONNCOUNT_GC_MAX_NODES) 199 164 break; 200 165 201 - found = find_or_evict(net, list, conn); 166 + found = find_or_evict(net, list, conn, &free_entry); 202 167 if (IS_ERR(found)) { 203 168 /* Not found, but might be about to be confirmed */ 204 169 if (PTR_ERR(found) == -EAGAIN) { ··· 243 208 244 209 void nf_conncount_list_init(struct nf_conncount_list *list) 245 210 { 211 + spin_lock_init(&list->list_lock); 246 212 INIT_LIST_HEAD(&list->head); 247 213 list->count = 1; 214 + list->dead = false; 248 215 } 249 216 EXPORT_SYMBOL_GPL(nf_conncount_list_init); 250 217 251 - void nf_conncount_gc_list(struct net *net, 218 + /* Return true if the list is empty */ 219 + bool nf_conncount_gc_list(struct net *net, 252 220 struct nf_conncount_list *list) 253 221 { 254 222 const struct nf_conntrack_tuple_hash *found; 255 223 struct nf_conncount_tuple *conn, *conn_n; 256 224 struct nf_conn *found_ct; 257 225 unsigned int collected = 0; 226 + bool free_entry = false; 258 227 259 228 list_for_each_entry_safe(conn, conn_n, &list->head, node) { 260 - found = find_or_evict(net, list, conn); 229 + found = find_or_evict(net, list, conn, &free_entry); 261 230 if (IS_ERR(found)) { 262 - if (PTR_ERR(found) == -ENOENT) 231 + if (PTR_ERR(found) == -ENOENT) { 232 + if (free_entry) 233 + return true; 263 234 collected++; 235 + } 264 236 continue; 265 237 } 266 238 ··· 278 236 * closed already -> ditch it 279 237 */ 280 238 nf_ct_put(found_ct); 281 - conn_free(list, conn); 239 + if (conn_free(list, conn)) 240 + return true; 282 241 collected++; 283 242 continue; 284 243 } 285 244 286 245 nf_ct_put(found_ct); 287 246 if (collected > CONNCOUNT_GC_MAX_NODES) 288 - return; 247 + return false; 289 248 } 249 + return false; 290 250 } 291 251 EXPORT_SYMBOL_GPL(nf_conncount_gc_list); 252 + 253 + static void __tree_nodes_free(struct rcu_head *h) 254 + { 255 + struct nf_conncount_rb *rbconn; 256 + 257 + rbconn = container_of(h, struct nf_conncount_rb, rcu_head); 
258 + kmem_cache_free(conncount_rb_cachep, rbconn); 259 + } 292 260 293 261 static void tree_nodes_free(struct rb_root *root, 294 262 struct nf_conncount_rb *gc_nodes[], ··· 308 256 309 257 while (gc_count) { 310 258 rbconn = gc_nodes[--gc_count]; 311 - rb_erase(&rbconn->node, root); 312 - kmem_cache_free(conncount_rb_cachep, rbconn); 259 + spin_lock(&rbconn->list.list_lock); 260 + if (rbconn->list.count == 0 && rbconn->list.dead == false) { 261 + rbconn->list.dead = true; 262 + rb_erase(&rbconn->node, root); 263 + call_rcu(&rbconn->rcu_head, __tree_nodes_free); 264 + } 265 + spin_unlock(&rbconn->list.list_lock); 313 266 } 314 267 } 315 268 269 + static void schedule_gc_worker(struct nf_conncount_data *data, int tree) 270 + { 271 + set_bit(tree, data->pending_trees); 272 + schedule_work(&data->gc_work); 273 + } 274 + 316 275 static unsigned int 317 - insert_tree(struct rb_root *root, 276 + insert_tree(struct net *net, 277 + struct nf_conncount_data *data, 278 + struct rb_root *root, 318 279 unsigned int hash, 319 280 const u32 *key, 320 281 u8 keylen, 321 282 const struct nf_conntrack_tuple *tuple, 322 283 const struct nf_conntrack_zone *zone) 323 284 { 285 + enum nf_conncount_list_add ret; 286 + struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; 324 287 struct rb_node **rbnode, *parent; 325 288 struct nf_conncount_rb *rbconn; 326 289 struct nf_conncount_tuple *conn; 327 - unsigned int count = 0; 290 + unsigned int count = 0, gc_count = 0; 291 + bool node_found = false; 328 292 329 293 spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); 330 294 ··· 358 290 rbnode = &((*rbnode)->rb_right); 359 291 } else { 360 292 /* unlikely: other cpu added node already */ 361 - if (!nf_conncount_add(&rbconn->list, tuple, zone)) { 293 + node_found = true; 294 + ret = nf_conncount_add(&rbconn->list, tuple, zone); 295 + if (ret == NF_CONNCOUNT_ERR) { 362 296 count = 0; /* hotdrop */ 363 - goto out_unlock; 297 + } else if (ret == NF_CONNCOUNT_ADDED) { 298 + count = 
rbconn->list.count; 299 + } else { 300 + /* NF_CONNCOUNT_SKIP, rbconn is already 301 + * reclaimed by gc, insert a new tree node 302 + */ 303 + node_found = false; 364 304 } 365 - 366 - count = rbconn->list.count; 367 - goto out_unlock; 305 + break; 368 306 } 307 + 308 + if (gc_count >= ARRAY_SIZE(gc_nodes)) 309 + continue; 310 + 311 + if (nf_conncount_gc_list(net, &rbconn->list)) 312 + gc_nodes[gc_count++] = rbconn; 369 313 } 314 + 315 + if (gc_count) { 316 + tree_nodes_free(root, gc_nodes, gc_count); 317 + /* tree_node_free before new allocation permits 318 + * allocator to re-use newly free'd object. 319 + * 320 + * This is a rare event; in most cases we will find 321 + * existing node to re-use. (or gc_count is 0). 322 + */ 323 + 324 + if (gc_count >= ARRAY_SIZE(gc_nodes)) 325 + schedule_gc_worker(data, hash); 326 + } 327 + 328 + if (node_found) 329 + goto out_unlock; 370 330 371 331 /* expected case: match, insert new node */ 372 332 rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); ··· 429 333 const struct nf_conntrack_tuple *tuple, 430 334 const struct nf_conntrack_zone *zone) 431 335 { 432 - struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; 336 + enum nf_conncount_list_add ret; 433 337 struct rb_root *root; 434 - struct rb_node **rbnode, *parent; 338 + struct rb_node *parent; 435 339 struct nf_conncount_rb *rbconn; 436 - unsigned int gc_count, hash; 437 - bool no_gc = false; 438 - unsigned int count = 0; 340 + unsigned int hash; 439 341 u8 keylen = data->keylen; 440 342 441 343 hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS; 442 344 root = &data->root[hash]; 443 345 444 - spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); 445 - restart: 446 - gc_count = 0; 447 - parent = NULL; 448 - rbnode = &(root->rb_node); 449 - while (*rbnode) { 346 + parent = rcu_dereference_raw(root->rb_node); 347 + while (parent) { 450 348 int diff; 451 349 bool addit; 452 350 453 - rbconn = rb_entry(*rbnode, struct nf_conncount_rb, 
node); 351 + rbconn = rb_entry(parent, struct nf_conncount_rb, node); 454 352 455 - parent = *rbnode; 456 353 diff = key_diff(key, rbconn->key, keylen); 457 354 if (diff < 0) { 458 - rbnode = &((*rbnode)->rb_left); 355 + parent = rcu_dereference_raw(parent->rb_left); 459 356 } else if (diff > 0) { 460 - rbnode = &((*rbnode)->rb_right); 357 + parent = rcu_dereference_raw(parent->rb_right); 461 358 } else { 462 359 /* same source network -> be counted! */ 463 360 nf_conncount_lookup(net, &rbconn->list, tuple, zone, 464 361 &addit); 465 - count = rbconn->list.count; 466 362 467 - tree_nodes_free(root, gc_nodes, gc_count); 468 363 if (!addit) 469 - goto out_unlock; 364 + return rbconn->list.count; 470 365 471 - if (!nf_conncount_add(&rbconn->list, tuple, zone)) 472 - count = 0; /* hotdrop */ 473 - goto out_unlock; 474 - 475 - count++; 476 - goto out_unlock; 366 + ret = nf_conncount_add(&rbconn->list, tuple, zone); 367 + if (ret == NF_CONNCOUNT_ERR) { 368 + return 0; /* hotdrop */ 369 + } else if (ret == NF_CONNCOUNT_ADDED) { 370 + return rbconn->list.count; 371 + } else { 372 + /* NF_CONNCOUNT_SKIP, rbconn is already 373 + * reclaimed by gc, insert a new tree node 374 + */ 375 + break; 376 + } 477 377 } 378 + } 478 379 479 - if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes)) 480 - continue; 380 + if (!tuple) 381 + return 0; 481 382 482 - nf_conncount_gc_list(net, &rbconn->list); 483 - if (list_empty(&rbconn->list.head)) 383 + return insert_tree(net, data, root, hash, key, keylen, tuple, zone); 384 + } 385 + 386 + static void tree_gc_worker(struct work_struct *work) 387 + { 388 + struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work); 389 + struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn; 390 + struct rb_root *root; 391 + struct rb_node *node; 392 + unsigned int tree, next_tree, gc_count = 0; 393 + 394 + tree = data->gc_tree % CONNCOUNT_LOCK_SLOTS; 395 + root = &data->root[tree]; 396 + 397 + rcu_read_lock(); 398 + for (node 
= rb_first(root); node != NULL; node = rb_next(node)) { 399 + rbconn = rb_entry(node, struct nf_conncount_rb, node); 400 + if (nf_conncount_gc_list(data->net, &rbconn->list)) 484 401 gc_nodes[gc_count++] = rbconn; 485 402 } 403 + rcu_read_unlock(); 404 + 405 + spin_lock_bh(&nf_conncount_locks[tree]); 486 406 487 407 if (gc_count) { 488 - no_gc = true; 489 408 tree_nodes_free(root, gc_nodes, gc_count); 490 - /* tree_node_free before new allocation permits 491 - * allocator to re-use newly free'd object. 492 - * 493 - * This is a rare event; in most cases we will find 494 - * existing node to re-use. (or gc_count is 0). 495 - */ 496 - goto restart; 497 409 } 498 410 499 - count = 0; 500 - if (!tuple) 501 - goto out_unlock; 411 + clear_bit(tree, data->pending_trees); 502 412 503 - spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); 504 - return insert_tree(root, hash, key, keylen, tuple, zone); 413 + next_tree = (tree + 1) % CONNCOUNT_SLOTS; 414 + next_tree = find_next_bit(data->pending_trees, next_tree, CONNCOUNT_SLOTS); 505 415 506 - out_unlock: 507 - spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); 508 - return count; 416 + if (next_tree < CONNCOUNT_SLOTS) { 417 + data->gc_tree = next_tree; 418 + schedule_work(work); 419 + } 420 + 421 + spin_unlock_bh(&nf_conncount_locks[tree]); 509 422 } 510 423 511 424 /* Count and return number of conntrack entries in 'net' with particular 'key'. 512 425 * If 'tuple' is not null, insert it into the accounting data structure. 426 + * Call with RCU read lock. 
513 427 */ 514 428 unsigned int nf_conncount_count(struct net *net, 515 429 struct nf_conncount_data *data, ··· 558 452 data->root[i] = RB_ROOT; 559 453 560 454 data->keylen = keylen / sizeof(u32); 455 + data->net = net; 456 + INIT_WORK(&data->gc_work, tree_gc_worker); 561 457 562 458 return data; 563 459 } ··· 595 487 { 596 488 unsigned int i; 597 489 490 + cancel_work_sync(&data->gc_work); 598 491 nf_ct_netns_put(net, family); 599 492 600 493 for (i = 0; i < ARRAY_SIZE(data->root); ++i)
+2 -15
net/netfilter/nft_connlimit.c
··· 14 14 #include <net/netfilter/nf_conntrack_zones.h> 15 15 16 16 struct nft_connlimit { 17 - spinlock_t lock; 18 17 struct nf_conncount_list list; 19 18 u32 limit; 20 19 bool invert; ··· 44 45 return; 45 46 } 46 47 47 - spin_lock_bh(&priv->lock); 48 48 nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone, 49 49 &addit); 50 50 count = priv->list.count; ··· 51 53 if (!addit) 52 54 goto out; 53 55 54 - if (!nf_conncount_add(&priv->list, tuple_ptr, zone)) { 56 + if (nf_conncount_add(&priv->list, tuple_ptr, zone) == NF_CONNCOUNT_ERR) { 55 57 regs->verdict.code = NF_DROP; 56 - spin_unlock_bh(&priv->lock); 57 58 return; 58 59 } 59 60 count++; 60 61 out: 61 - spin_unlock_bh(&priv->lock); 62 62 63 63 if ((count > priv->limit) ^ priv->invert) { 64 64 regs->verdict.code = NFT_BREAK; ··· 84 88 invert = true; 85 89 } 86 90 87 - spin_lock_init(&priv->lock); 88 91 nf_conncount_list_init(&priv->list); 89 92 priv->limit = limit; 90 93 priv->invert = invert; ··· 208 213 struct nft_connlimit *priv_dst = nft_expr_priv(dst); 209 214 struct nft_connlimit *priv_src = nft_expr_priv(src); 210 215 211 - spin_lock_init(&priv_dst->lock); 212 216 nf_conncount_list_init(&priv_dst->list); 213 217 priv_dst->limit = priv_src->limit; 214 218 priv_dst->invert = priv_src->invert; ··· 226 232 static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) 227 233 { 228 234 struct nft_connlimit *priv = nft_expr_priv(expr); 229 - bool ret; 230 235 231 - spin_lock_bh(&priv->lock); 232 - nf_conncount_gc_list(net, &priv->list); 233 - 234 - ret = list_empty(&priv->list.head); 235 - spin_unlock_bh(&priv->lock); 236 - 237 - return ret; 236 + return nf_conncount_gc_list(net, &priv->list); 238 237 } 239 238 240 239 static struct nft_expr_type nft_connlimit_type;