Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netfilter: ctnetlink: ensure safe access to master conntrack

Holding reference on the expectation is not sufficient, the master
conntrack object can just go away, making exp->master invalid.

To access exp->master safely:

- Grab the nf_conntrack_expect_lock, this gets serialized with
clean_from_lists() which also holds this lock when the master
conntrack goes away.

- Hold a reference on the master conntrack via nf_conntrack_find_get().
Not so easy, since the tuple needed to look up the master conntrack
is not available in the existing problematic paths.

This patch goes for extending the nf_conntrack_expect_lock section
to address this issue, for simplicity; in the cases described below
this only slightly extends the lock section.

The add expectation command already holds a reference to the master
conntrack from ctnetlink_create_expect().

However, the delete expectation command needs to grab the spinlock
before looking up the expectation. Expand the existing spinlock
section to cover the expectation lookup. Note that the
nf_ct_expect_iterate_net() call already grabs the spinlock while
iterating over the expectation table, which is correct.

The get expectation command needs to grab the spinlock to ensure the
master conntrack does not go away. This also expands the existing
spinlock section to cover the expectation lookup. I needed to move the
netlink skb allocation out of the spinlock section to keep it GFP_KERNEL.

For the expectation events, the IPEXP_DESTROY event is already delivered
under the spinlock, just move the delivery of IPEXP_NEW under the
spinlock too because the master conntrack event cache is reached through
exp->master.

While at it, add lockdep annotations to help identify which codepaths
need to grab the spinlock.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

+35 -10
+5
include/net/netfilter/nf_conntrack_core.h
··· 83 83 84 84 extern spinlock_t nf_conntrack_expect_lock; 85 85 86 + static inline void lockdep_nfct_expect_lock_held(void) 87 + { 88 + lockdep_assert_held(&nf_conntrack_expect_lock); 89 + } 90 + 86 91 /* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */ 87 92 88 93 static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout)
+2
net/netfilter/nf_conntrack_ecache.c
··· 247 247 struct nf_ct_event_notifier *notify; 248 248 struct nf_conntrack_ecache *e; 249 249 250 + lockdep_nfct_expect_lock_held(); 251 + 250 252 rcu_read_lock(); 251 253 notify = rcu_dereference(net->ct.nf_conntrack_event_cb); 252 254 if (!notify)
+9 -1
net/netfilter/nf_conntrack_expect.c
··· 51 51 struct net *net = nf_ct_exp_net(exp); 52 52 struct nf_conntrack_net *cnet; 53 53 54 + lockdep_nfct_expect_lock_held(); 54 55 WARN_ON(!master_help); 55 56 WARN_ON(timer_pending(&exp->timeout)); 56 57 ··· 119 118 120 119 bool nf_ct_remove_expect(struct nf_conntrack_expect *exp) 121 120 { 121 + lockdep_nfct_expect_lock_held(); 122 + 122 123 if (timer_delete(&exp->timeout)) { 123 124 nf_ct_unlink_expect(exp); 124 125 nf_ct_expect_put(exp); ··· 179 176 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 180 177 struct nf_conntrack_expect *i, *exp = NULL; 181 178 unsigned int h; 179 + 180 + lockdep_nfct_expect_lock_held(); 182 181 183 182 if (!cnet->expect_count) 184 183 return NULL; ··· 459 454 unsigned int h; 460 455 int ret = 0; 461 456 457 + lockdep_nfct_expect_lock_held(); 458 + 462 459 if (!master_help) { 463 460 ret = -ESHUTDOWN; 464 461 goto out; ··· 517 510 518 511 nf_ct_expect_insert(expect); 519 512 520 - spin_unlock_bh(&nf_conntrack_expect_lock); 521 513 nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); 514 + spin_unlock_bh(&nf_conntrack_expect_lock); 515 + 522 516 return 0; 523 517 out: 524 518 spin_unlock_bh(&nf_conntrack_expect_lock);
+19 -9
net/netfilter/nf_conntrack_netlink.c
··· 3355 3355 if (err < 0) 3356 3356 return err; 3357 3357 3358 + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 3359 + if (!skb2) 3360 + return -ENOMEM; 3361 + 3362 + spin_lock_bh(&nf_conntrack_expect_lock); 3358 3363 exp = nf_ct_expect_find_get(info->net, &zone, &tuple); 3359 - if (!exp) 3364 + if (!exp) { 3365 + spin_unlock_bh(&nf_conntrack_expect_lock); 3366 + kfree_skb(skb2); 3360 3367 return -ENOENT; 3368 + } 3361 3369 3362 3370 if (cda[CTA_EXPECT_ID]) { 3363 3371 __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); 3364 3372 3365 3373 if (id != nf_expect_get_id(exp)) { 3366 3374 nf_ct_expect_put(exp); 3375 + spin_unlock_bh(&nf_conntrack_expect_lock); 3376 + kfree_skb(skb2); 3367 3377 return -ENOENT; 3368 3378 } 3369 - } 3370 - 3371 - skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 3372 - if (!skb2) { 3373 - nf_ct_expect_put(exp); 3374 - return -ENOMEM; 3375 3379 } 3376 3380 3377 3381 rcu_read_lock(); ··· 3384 3380 exp); 3385 3381 rcu_read_unlock(); 3386 3382 nf_ct_expect_put(exp); 3383 + spin_unlock_bh(&nf_conntrack_expect_lock); 3384 + 3387 3385 if (err <= 0) { 3388 3386 kfree_skb(skb2); 3389 3387 return -ENOMEM; ··· 3432 3426 if (err < 0) 3433 3427 return err; 3434 3428 3429 + spin_lock_bh(&nf_conntrack_expect_lock); 3430 + 3435 3431 /* bump usage count to 2 */ 3436 3432 exp = nf_ct_expect_find_get(info->net, &zone, &tuple); 3437 - if (!exp) 3433 + if (!exp) { 3434 + spin_unlock_bh(&nf_conntrack_expect_lock); 3438 3435 return -ENOENT; 3436 + } 3439 3437 3440 3438 if (cda[CTA_EXPECT_ID]) { 3441 3439 __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); 3442 3440 3443 3441 if (id != nf_expect_get_id(exp)) { 3444 3442 nf_ct_expect_put(exp); 3443 + spin_unlock_bh(&nf_conntrack_expect_lock); 3445 3444 return -ENOENT; 3446 3445 } 3447 3446 } 3448 3447 3449 3448 /* after list removal, usage count == 1 */ 3450 - spin_lock_bh(&nf_conntrack_expect_lock); 3451 3449 if (timer_delete(&exp->timeout)) { 3452 3450 nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid, 3453 3451 
nlmsg_report(info->nlh));