Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netfilter: nf_conncount: rework API to use sk_buff directly

When using the nf_conncount infrastructure for non-confirmed connections,
duplicate tracking of the same connection is possible, due to an optimization
introduced by commit d265929930e2 ("netfilter: nf_conncount: reduce unnecessary GC").

In order to fix this, introduce a new conncount API that receives an sk_buff
struct directly and fetches the tuple, the zone and the corresponding ct from
it. The API comes in both of the existing conncount variants,
nf_conncount_count_skb() and nf_conncount_add_skb(). In addition, remove the
old API and adjust all of its users to the new one.

This way, for each sk_buff struct it is possible to check whether a ct is
present and already confirmed. If so, the add operation is skipped.

Fixes: d265929930e2 ("netfilter: nf_conncount: reduce unnecessary GC")
Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

authored by

Fernando Fernandez Mancera and committed by
Pablo Neira Ayuso
be102eb6 fe831331

+145 -106
+7 -8
include/net/netfilter/nf_conntrack_count.h
··· 18 18 struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen); 19 19 void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data); 20 20 21 - unsigned int nf_conncount_count(struct net *net, 22 - struct nf_conncount_data *data, 23 - const u32 *key, 24 - const struct nf_conntrack_tuple *tuple, 25 - const struct nf_conntrack_zone *zone); 21 + unsigned int nf_conncount_count_skb(struct net *net, 22 + const struct sk_buff *skb, 23 + u16 l3num, 24 + struct nf_conncount_data *data, 25 + const u32 *key); 26 26 27 - int nf_conncount_add(struct net *net, struct nf_conncount_list *list, 28 - const struct nf_conntrack_tuple *tuple, 29 - const struct nf_conntrack_zone *zone); 27 + int nf_conncount_add_skb(struct net *net, const struct sk_buff *skb, 28 + u16 l3num, struct nf_conncount_list *list); 30 29 31 30 void nf_conncount_list_init(struct nf_conncount_list *list); 32 31
+124 -61
net/netfilter/nf_conncount.c
··· 122 122 return ERR_PTR(-EAGAIN); 123 123 } 124 124 125 - static int __nf_conncount_add(struct net *net, 126 - struct nf_conncount_list *list, 127 - const struct nf_conntrack_tuple *tuple, 128 - const struct nf_conntrack_zone *zone) 125 + static bool get_ct_or_tuple_from_skb(struct net *net, 126 + const struct sk_buff *skb, 127 + u16 l3num, 128 + struct nf_conn **ct, 129 + struct nf_conntrack_tuple *tuple, 130 + const struct nf_conntrack_zone **zone, 131 + bool *refcounted) 129 132 { 133 + const struct nf_conntrack_tuple_hash *h; 134 + enum ip_conntrack_info ctinfo; 135 + struct nf_conn *found_ct; 136 + 137 + found_ct = nf_ct_get(skb, &ctinfo); 138 + if (found_ct && !nf_ct_is_template(found_ct)) { 139 + *tuple = found_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; 140 + *zone = nf_ct_zone(found_ct); 141 + *ct = found_ct; 142 + return true; 143 + } 144 + 145 + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num, net, tuple)) 146 + return false; 147 + 148 + if (found_ct) 149 + *zone = nf_ct_zone(found_ct); 150 + 151 + h = nf_conntrack_find_get(net, *zone, tuple); 152 + if (!h) 153 + return true; 154 + 155 + found_ct = nf_ct_tuplehash_to_ctrack(h); 156 + *refcounted = true; 157 + *ct = found_ct; 158 + 159 + return true; 160 + } 161 + 162 + static int __nf_conncount_add(struct net *net, 163 + const struct sk_buff *skb, 164 + u16 l3num, 165 + struct nf_conncount_list *list) 166 + { 167 + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; 130 168 const struct nf_conntrack_tuple_hash *found; 131 169 struct nf_conncount_tuple *conn, *conn_n; 170 + struct nf_conntrack_tuple tuple; 171 + struct nf_conn *ct = NULL; 132 172 struct nf_conn *found_ct; 133 173 unsigned int collect = 0; 174 + bool refcounted = false; 175 + 176 + if (!get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted)) 177 + return -ENOENT; 178 + 179 + if (ct && nf_ct_is_confirmed(ct)) { 180 + if (refcounted) 181 + nf_ct_put(ct); 182 + return 0; 183 + } 134 184 135 185 if 
((u32)jiffies == list->last_gc) 136 186 goto add_new_node; ··· 194 144 if (IS_ERR(found)) { 195 145 /* Not found, but might be about to be confirmed */ 196 146 if (PTR_ERR(found) == -EAGAIN) { 197 - if (nf_ct_tuple_equal(&conn->tuple, tuple) && 147 + if (nf_ct_tuple_equal(&conn->tuple, &tuple) && 198 148 nf_ct_zone_id(&conn->zone, conn->zone.dir) == 199 149 nf_ct_zone_id(zone, zone->dir)) 200 - return 0; /* already exists */ 150 + goto out_put; /* already exists */ 201 151 } else { 202 152 collect++; 203 153 } ··· 206 156 207 157 found_ct = nf_ct_tuplehash_to_ctrack(found); 208 158 209 - if (nf_ct_tuple_equal(&conn->tuple, tuple) && 159 + if (nf_ct_tuple_equal(&conn->tuple, &tuple) && 210 160 nf_ct_zone_equal(found_ct, zone, zone->dir)) { 211 161 /* 212 162 * We should not see tuples twice unless someone hooks ··· 215 165 * Attempt to avoid a re-add in this case. 216 166 */ 217 167 nf_ct_put(found_ct); 218 - return 0; 168 + goto out_put; 219 169 } else if (already_closed(found_ct)) { 220 170 /* 221 171 * we do not care about connections which are ··· 238 188 if (conn == NULL) 239 189 return -ENOMEM; 240 190 241 - conn->tuple = *tuple; 191 + conn->tuple = tuple; 242 192 conn->zone = *zone; 243 193 conn->cpu = raw_smp_processor_id(); 244 194 conn->jiffies32 = (u32)jiffies; 245 195 list_add_tail(&conn->node, &list->head); 246 196 list->count++; 247 197 list->last_gc = (u32)jiffies; 198 + 199 + out_put: 200 + if (refcounted) 201 + nf_ct_put(ct); 248 202 return 0; 249 203 } 250 204 251 - int nf_conncount_add(struct net *net, 252 - struct nf_conncount_list *list, 253 - const struct nf_conntrack_tuple *tuple, 254 - const struct nf_conntrack_zone *zone) 205 + int nf_conncount_add_skb(struct net *net, 206 + const struct sk_buff *skb, 207 + u16 l3num, 208 + struct nf_conncount_list *list) 255 209 { 256 210 int ret; 257 211 258 212 /* check the saved connections */ 259 213 spin_lock_bh(&list->list_lock); 260 - ret = __nf_conncount_add(net, list, tuple, zone); 214 + ret = 
__nf_conncount_add(net, skb, l3num, list); 261 215 spin_unlock_bh(&list->list_lock); 262 216 263 217 return ret; 264 218 } 265 - EXPORT_SYMBOL_GPL(nf_conncount_add); 219 + EXPORT_SYMBOL_GPL(nf_conncount_add_skb); 266 220 267 221 void nf_conncount_list_init(struct nf_conncount_list *list) 268 222 { ··· 363 309 364 310 static unsigned int 365 311 insert_tree(struct net *net, 312 + const struct sk_buff *skb, 313 + u16 l3num, 366 314 struct nf_conncount_data *data, 367 315 struct rb_root *root, 368 316 unsigned int hash, 369 - const u32 *key, 370 - const struct nf_conntrack_tuple *tuple, 371 - const struct nf_conntrack_zone *zone) 317 + const u32 *key) 372 318 { 373 319 struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; 374 - struct rb_node **rbnode, *parent; 375 - struct nf_conncount_rb *rbconn; 376 - struct nf_conncount_tuple *conn; 320 + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; 321 + bool do_gc = true, refcounted = false; 377 322 unsigned int count = 0, gc_count = 0; 378 - bool do_gc = true; 323 + struct rb_node **rbnode, *parent; 324 + struct nf_conntrack_tuple tuple; 325 + struct nf_conncount_tuple *conn; 326 + struct nf_conncount_rb *rbconn; 327 + struct nf_conn *ct = NULL; 379 328 380 329 spin_lock_bh(&nf_conncount_locks[hash]); 381 330 restart: ··· 397 340 } else { 398 341 int ret; 399 342 400 - ret = nf_conncount_add(net, &rbconn->list, tuple, zone); 343 + ret = nf_conncount_add_skb(net, skb, l3num, &rbconn->list); 401 344 if (ret) 402 345 count = 0; /* hotdrop */ 403 346 else ··· 421 364 goto restart; 422 365 } 423 366 424 - /* expected case: match, insert new node */ 425 - rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); 426 - if (rbconn == NULL) 427 - goto out_unlock; 367 + if (get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted)) { 368 + /* expected case: match, insert new node */ 369 + rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); 370 + if (rbconn == NULL) 371 + goto out_unlock; 428 372 
429 - conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); 430 - if (conn == NULL) { 431 - kmem_cache_free(conncount_rb_cachep, rbconn); 432 - goto out_unlock; 373 + conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); 374 + if (conn == NULL) { 375 + kmem_cache_free(conncount_rb_cachep, rbconn); 376 + goto out_unlock; 377 + } 378 + 379 + conn->tuple = tuple; 380 + conn->zone = *zone; 381 + conn->cpu = raw_smp_processor_id(); 382 + conn->jiffies32 = (u32)jiffies; 383 + memcpy(rbconn->key, key, sizeof(u32) * data->keylen); 384 + 385 + nf_conncount_list_init(&rbconn->list); 386 + list_add(&conn->node, &rbconn->list.head); 387 + count = 1; 388 + rbconn->list.count = count; 389 + 390 + rb_link_node_rcu(&rbconn->node, parent, rbnode); 391 + rb_insert_color(&rbconn->node, root); 392 + 393 + if (refcounted) 394 + nf_ct_put(ct); 433 395 } 434 - 435 - conn->tuple = *tuple; 436 - conn->zone = *zone; 437 - conn->cpu = raw_smp_processor_id(); 438 - conn->jiffies32 = (u32)jiffies; 439 - memcpy(rbconn->key, key, sizeof(u32) * data->keylen); 440 - 441 - nf_conncount_list_init(&rbconn->list); 442 - list_add(&conn->node, &rbconn->list.head); 443 - count = 1; 444 - rbconn->list.count = count; 445 - 446 - rb_link_node_rcu(&rbconn->node, parent, rbnode); 447 - rb_insert_color(&rbconn->node, root); 448 396 out_unlock: 449 397 spin_unlock_bh(&nf_conncount_locks[hash]); 450 398 return count; ··· 457 395 458 396 static unsigned int 459 397 count_tree(struct net *net, 398 + const struct sk_buff *skb, 399 + u16 l3num, 460 400 struct nf_conncount_data *data, 461 - const u32 *key, 462 - const struct nf_conntrack_tuple *tuple, 463 - const struct nf_conntrack_zone *zone) 401 + const u32 *key) 464 402 { 465 403 struct rb_root *root; 466 404 struct rb_node *parent; ··· 484 422 } else { 485 423 int ret; 486 424 487 - if (!tuple) { 425 + if (!skb) { 488 426 nf_conncount_gc_list(net, &rbconn->list); 489 427 return rbconn->list.count; 490 428 } ··· 499 437 } 500 438 501 439 /* same 
source network -> be counted! */ 502 - ret = __nf_conncount_add(net, &rbconn->list, tuple, zone); 440 + ret = __nf_conncount_add(net, skb, l3num, &rbconn->list); 503 441 spin_unlock_bh(&rbconn->list.list_lock); 504 442 if (ret) 505 443 return 0; /* hotdrop */ ··· 508 446 } 509 447 } 510 448 511 - if (!tuple) 449 + if (!skb) 512 450 return 0; 513 451 514 - return insert_tree(net, data, root, hash, key, tuple, zone); 452 + return insert_tree(net, skb, l3num, data, root, hash, key); 515 453 } 516 454 517 455 static void tree_gc_worker(struct work_struct *work) ··· 573 511 } 574 512 575 513 /* Count and return number of conntrack entries in 'net' with particular 'key'. 576 - * If 'tuple' is not null, insert it into the accounting data structure. 577 - * Call with RCU read lock. 514 + * If 'skb' is not null, insert the corresponding tuple into the accounting 515 + * data structure. Call with RCU read lock. 578 516 */ 579 - unsigned int nf_conncount_count(struct net *net, 580 - struct nf_conncount_data *data, 581 - const u32 *key, 582 - const struct nf_conntrack_tuple *tuple, 583 - const struct nf_conntrack_zone *zone) 517 + unsigned int nf_conncount_count_skb(struct net *net, 518 + const struct sk_buff *skb, 519 + u16 l3num, 520 + struct nf_conncount_data *data, 521 + const u32 *key) 584 522 { 585 - return count_tree(net, data, key, tuple, zone); 523 + return count_tree(net, skb, l3num, data, key); 524 + 586 525 } 587 - EXPORT_SYMBOL_GPL(nf_conncount_count); 526 + EXPORT_SYMBOL_GPL(nf_conncount_count_skb); 588 527 589 528 struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen) 590 529 {
+3 -18
net/netfilter/nft_connlimit.c
··· 24 24 const struct nft_pktinfo *pkt, 25 25 const struct nft_set_ext *ext) 26 26 { 27 - const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; 28 - const struct nf_conntrack_tuple *tuple_ptr; 29 - struct nf_conntrack_tuple tuple; 30 - enum ip_conntrack_info ctinfo; 31 - const struct nf_conn *ct; 32 27 unsigned int count; 28 + int err; 33 29 34 - tuple_ptr = &tuple; 35 - 36 - ct = nf_ct_get(pkt->skb, &ctinfo); 37 - if (ct != NULL) { 38 - tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; 39 - zone = nf_ct_zone(ct); 40 - } else if (!nf_ct_get_tuplepr(pkt->skb, skb_network_offset(pkt->skb), 41 - nft_pf(pkt), nft_net(pkt), &tuple)) { 42 - regs->verdict.code = NF_DROP; 43 - return; 44 - } 45 - 46 - if (nf_conncount_add(nft_net(pkt), priv->list, tuple_ptr, zone)) { 30 + err = nf_conncount_add_skb(nft_net(pkt), pkt->skb, nft_pf(pkt), priv->list); 31 + if (err) { 47 32 regs->verdict.code = NF_DROP; 48 33 return; 49 34 }
+3 -11
net/netfilter/xt_connlimit.c
··· 31 31 { 32 32 struct net *net = xt_net(par); 33 33 const struct xt_connlimit_info *info = par->matchinfo; 34 - struct nf_conntrack_tuple tuple; 35 - const struct nf_conntrack_tuple *tuple_ptr = &tuple; 36 34 const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; 37 35 enum ip_conntrack_info ctinfo; 38 36 const struct nf_conn *ct; ··· 38 40 u32 key[5]; 39 41 40 42 ct = nf_ct_get(skb, &ctinfo); 41 - if (ct != NULL) { 42 - tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; 43 + if (ct) 43 44 zone = nf_ct_zone(ct); 44 - } else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), 45 - xt_family(par), net, &tuple)) { 46 - goto hotdrop; 47 - } 48 45 49 46 if (xt_family(par) == NFPROTO_IPV6) { 50 47 const struct ipv6hdr *iph = ipv6_hdr(skb); ··· 62 69 key[1] = zone->id; 63 70 } 64 71 65 - connections = nf_conncount_count(net, info->data, key, tuple_ptr, 66 - zone); 72 + connections = nf_conncount_count_skb(net, skb, xt_family(par), info->data, key); 67 73 if (connections == 0) 68 - /* kmalloc failed, drop it entirely */ 74 + /* kmalloc failed or tuple couldn't be found, drop it entirely */ 69 75 goto hotdrop; 70 76 71 77 return (connections > info->limit) ^ !!(info->flags & XT_CONNLIMIT_INVERT);
+8 -8
net/openvswitch/conntrack.c
··· 928 928 } 929 929 930 930 static int ovs_ct_check_limit(struct net *net, 931 - const struct ovs_conntrack_info *info, 932 - const struct nf_conntrack_tuple *tuple) 931 + const struct sk_buff *skb, 932 + const struct ovs_conntrack_info *info) 933 933 { 934 934 struct ovs_net *ovs_net = net_generic(net, ovs_net_id); 935 935 const struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info; ··· 942 942 if (per_zone_limit == OVS_CT_LIMIT_UNLIMITED) 943 943 return 0; 944 944 945 - connections = nf_conncount_count(net, ct_limit_info->data, 946 - &conncount_key, tuple, &info->zone); 945 + connections = nf_conncount_count_skb(net, skb, info->family, 946 + ct_limit_info->data, 947 + &conncount_key); 947 948 if (connections > per_zone_limit) 948 949 return -ENOMEM; 949 950 ··· 973 972 #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) 974 973 if (static_branch_unlikely(&ovs_ct_limit_enabled)) { 975 974 if (!nf_ct_is_confirmed(ct)) { 976 - err = ovs_ct_check_limit(net, info, 977 - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 975 + err = ovs_ct_check_limit(net, skb, info); 978 976 if (err) { 979 977 net_warn_ratelimited("openvswitch: zone: %u " 980 978 "exceeds conntrack limit\n", ··· 1770 1770 zone_limit.limit = limit; 1771 1771 nf_ct_zone_init(&ct_zone, zone_id, NF_CT_DEFAULT_ZONE_DIR, 0); 1772 1772 1773 - zone_limit.count = nf_conncount_count(net, data, &conncount_key, NULL, 1774 - &ct_zone); 1773 + zone_limit.count = nf_conncount_count_skb(net, NULL, 0, data, 1774 + &conncount_key); 1775 1775 return nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit); 1776 1776 } 1777 1777