Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netfilter: nf_conntrack: don't release a conntrack with non-zero refcnt

With this patch, the conntrack refcount is initially set to zero and
it is bumped once it is added to any of the lists, so we fulfill
Eric's golden rule which is that all released objects always have a
refcount that equals zero.

Andrey Vagin reports that nf_conntrack_free can't be called for a
conntrack with non-zero ref-counter, because it can race with
nf_conntrack_find_get().

A conntrack slab is created with SLAB_DESTROY_BY_RCU. Non-zero
ref-counter says that this conntrack is used. So when we release
a conntrack with non-zero counter, we break this assumption.

CPU1 CPU2
____nf_conntrack_find()
nf_ct_put()
destroy_conntrack()
...
init_conntrack
__nf_conntrack_alloc (set use = 1)
atomic_inc_not_zero(&ct->use) (use = 2)
if (!l4proto->new(ct, skb, dataoff, timeouts))
nf_conntrack_free(ct); (use = 2 !!!)
...
__nf_conntrack_alloc (set use = 1)
if (!nf_ct_key_equal(h, tuple, zone))
nf_ct_put(ct); (use = 0)
destroy_conntrack()
/* continue to work with CT */

After applying the patch "[PATCH] netfilter: nf_conntrack: fix RCU
race in nf_conntrack_find_get", another bug was triggered in
destroy_conntrack():

<4>[67096.759334] ------------[ cut here ]------------
<2>[67096.759353] kernel BUG at net/netfilter/nf_conntrack_core.c:211!
...
<4>[67096.759837] Pid: 498649, comm: atdd veid: 666 Tainted: G C --------------- 2.6.32-042stab084.18 #1 042stab084_18 /DQ45CB
<4>[67096.759932] RIP: 0010:[<ffffffffa03d99ac>] [<ffffffffa03d99ac>] destroy_conntrack+0x15c/0x190 [nf_conntrack]
<4>[67096.760255] Call Trace:
<4>[67096.760255] [<ffffffff814844a7>] nf_conntrack_destroy+0x17/0x30
<4>[67096.760255] [<ffffffffa03d9bb5>] nf_conntrack_find_get+0x85/0x130 [nf_conntrack]
<4>[67096.760255] [<ffffffffa03d9fb2>] nf_conntrack_in+0x352/0xb60 [nf_conntrack]
<4>[67096.760255] [<ffffffffa048c771>] ipv4_conntrack_local+0x51/0x60 [nf_conntrack_ipv4]
<4>[67096.760255] [<ffffffff81484419>] nf_iterate+0x69/0xb0
<4>[67096.760255] [<ffffffff814b5b00>] ? dst_output+0x0/0x20
<4>[67096.760255] [<ffffffff814845d4>] nf_hook_slow+0x74/0x110
<4>[67096.760255] [<ffffffff814b5b00>] ? dst_output+0x0/0x20
<4>[67096.760255] [<ffffffff814b66d5>] raw_sendmsg+0x775/0x910
<4>[67096.760255] [<ffffffff8104c5a8>] ? flush_tlb_others_ipi+0x128/0x130
<4>[67096.760255] [<ffffffff8100bc4e>] ? apic_timer_interrupt+0xe/0x20
<4>[67096.760255] [<ffffffff8100bc4e>] ? apic_timer_interrupt+0xe/0x20
<4>[67096.760255] [<ffffffff814c136a>] inet_sendmsg+0x4a/0xb0
<4>[67096.760255] [<ffffffff81444e93>] ? sock_sendmsg+0x13/0x140
<4>[67096.760255] [<ffffffff81444f97>] sock_sendmsg+0x117/0x140
<4>[67096.760255] [<ffffffff8102e299>] ? native_smp_send_reschedule+0x49/0x60
<4>[67096.760255] [<ffffffff81519beb>] ? _spin_unlock_bh+0x1b/0x20
<4>[67096.760255] [<ffffffff8109d930>] ? autoremove_wake_function+0x0/0x40
<4>[67096.760255] [<ffffffff814960f0>] ? do_ip_setsockopt+0x90/0xd80
<4>[67096.760255] [<ffffffff8100bc4e>] ? apic_timer_interrupt+0xe/0x20
<4>[67096.760255] [<ffffffff8100bc4e>] ? apic_timer_interrupt+0xe/0x20
<4>[67096.760255] [<ffffffff814457c9>] sys_sendto+0x139/0x190
<4>[67096.760255] [<ffffffff810efa77>] ? audit_syscall_entry+0x1d7/0x200
<4>[67096.760255] [<ffffffff810ef7c5>] ? __audit_syscall_exit+0x265/0x290
<4>[67096.760255] [<ffffffff81474daf>] compat_sys_socketcall+0x13f/0x210
<4>[67096.760255] [<ffffffff8104dea3>] ia32_sysret+0x0/0x5

I have reused the original title for the RFC patch that Andrey posted and
most of the original patch description.

Cc: Eric Dumazet <edumazet@google.com>
Cc: Andrew Vagin <avagin@parallels.com>
Cc: Florian Westphal <fw@strlen.de>
Reported-by: Andrew Vagin <avagin@parallels.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Andrew Vagin <avagin@parallels.com>

+34 -14
+2
include/net/netfilter/nf_conntrack.h
··· 284 284 extern unsigned int nf_conntrack_hash_rnd; 285 285 void init_nf_conntrack_hash_rnd(void); 286 286 287 + void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl); 288 + 287 289 #define NF_CT_STAT_INC(net, count) __this_cpu_inc((net)->ct.stat->count) 288 290 #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count) 289 291
+29 -5
net/netfilter/nf_conntrack_core.c
··· 448 448 goto out; 449 449 450 450 add_timer(&ct->timeout); 451 - nf_conntrack_get(&ct->ct_general); 451 + smp_wmb(); 452 + /* The caller holds a reference to this object */ 453 + atomic_set(&ct->ct_general.use, 2); 452 454 __nf_conntrack_hash_insert(ct, hash, repl_hash); 453 455 NF_CT_STAT_INC(net, insert); 454 456 spin_unlock_bh(&nf_conntrack_lock); ··· 463 461 return -EEXIST; 464 462 } 465 463 EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); 464 + 465 + /* deletion from this larval template list happens via nf_ct_put() */ 466 + void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl) 467 + { 468 + __set_bit(IPS_TEMPLATE_BIT, &tmpl->status); 469 + __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); 470 + nf_conntrack_get(&tmpl->ct_general); 471 + 472 + spin_lock_bh(&nf_conntrack_lock); 473 + /* Overload tuple linked list to put us in template list. */ 474 + hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 475 + &net->ct.tmpl); 476 + spin_unlock_bh(&nf_conntrack_lock); 477 + } 478 + EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert); 466 479 467 480 /* Confirm a connection given skb; places it in hash table */ 468 481 int ··· 750 733 nf_ct_zone->id = zone; 751 734 } 752 735 #endif 753 - /* 754 - * changes to lookup keys must be done before setting refcnt to 1 736 + /* Because we use RCU lookups, we set ct_general.use to zero before 737 + * this is inserted in any list. 
755 738 */ 756 - smp_wmb(); 757 - atomic_set(&ct->ct_general.use, 1); 739 + atomic_set(&ct->ct_general.use, 0); 758 740 return ct; 759 741 760 742 #ifdef CONFIG_NF_CONNTRACK_ZONES ··· 776 760 void nf_conntrack_free(struct nf_conn *ct) 777 761 { 778 762 struct net *net = nf_ct_net(ct); 763 + 764 + /* A freed object has refcnt == 0, that's 765 + * the golden rule for SLAB_DESTROY_BY_RCU 766 + */ 767 + NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0); 779 768 780 769 nf_ct_ext_destroy(ct); 781 770 nf_ct_ext_free(ct); ··· 876 855 __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); 877 856 NF_CT_STAT_INC(net, new); 878 857 } 858 + 859 + /* Now it is inserted into the unconfirmed list, bump refcount */ 860 + nf_conntrack_get(&ct->ct_general); 879 861 880 862 /* Overload tuple linked list to put us in unconfirmed list. */ 881 863 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+2 -3
net/netfilter/nf_synproxy_core.c
··· 363 363 goto err2; 364 364 if (!nfct_synproxy_ext_add(ct)) 365 365 goto err2; 366 - __set_bit(IPS_TEMPLATE_BIT, &ct->status); 367 - __set_bit(IPS_CONFIRMED_BIT, &ct->status); 368 366 367 + nf_conntrack_tmpl_insert(net, ct); 369 368 snet->tmpl = ct; 370 369 371 370 snet->stats = alloc_percpu(struct synproxy_stats); ··· 389 390 { 390 391 struct synproxy_net *snet = synproxy_pernet(net); 391 392 392 - nf_conntrack_free(snet->tmpl); 393 + nf_ct_put(snet->tmpl); 393 394 synproxy_proc_exit(net); 394 395 free_percpu(snet->stats); 395 396 }
+1 -6
net/netfilter/xt_CT.c
··· 228 228 goto err3; 229 229 } 230 230 231 - __set_bit(IPS_TEMPLATE_BIT, &ct->status); 232 - __set_bit(IPS_CONFIRMED_BIT, &ct->status); 233 - 234 - /* Overload tuple linked list to put us in template list. */ 235 - hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 236 - &par->net->ct.tmpl); 231 + nf_conntrack_tmpl_insert(par->net, ct); 237 232 out: 238 233 info->ct = ct; 239 234 return 0;