Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netfilter: ctnetlink: fix soft lockup when netlink adds new entries (v2)

Marcell Zambo and Janos Farago noticed and reported that when
new conntrack entries are added via netlink and the conntrack table
gets full, a soft lockup happens. This is because the nf_conntrack_lock
is held while nf_conntrack_alloc is called, which in turn wants
to lock nf_conntrack_lock while evicting entries from the full table.

The patch fixes the soft lockup by limiting the holding of the
nf_conntrack_lock to the minimum, where it's absolutely required.
This required extending (and thus changing) nf_conntrack_hash_insert
so that it makes sure conntrack and ctnetlink do not add the same entry
twice to the conntrack table.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

authored by

Jozsef Kadlecsik and committed by
Pablo Neira Ayuso
7d367e06 27907288

+51 -35
+1 -1
include/net/netfilter/nf_conntrack.h
··· 209 209 __nf_conntrack_find(struct net *net, u16 zone, 210 210 const struct nf_conntrack_tuple *tuple); 211 211 212 - extern void nf_conntrack_hash_insert(struct nf_conn *ct); 212 + extern int nf_conntrack_hash_check_insert(struct nf_conn *ct); 213 213 extern void nf_ct_delete_from_lists(struct nf_conn *ct); 214 214 extern void nf_ct_insert_dying_list(struct nf_conn *ct); 215 215
+34 -4
net/netfilter/nf_conntrack_core.c
··· 404 404 &net->ct.hash[repl_hash]); 405 405 } 406 406 407 - void nf_conntrack_hash_insert(struct nf_conn *ct) 407 + int 408 + nf_conntrack_hash_check_insert(struct nf_conn *ct) 408 409 { 409 410 struct net *net = nf_ct_net(ct); 410 411 unsigned int hash, repl_hash; 412 + struct nf_conntrack_tuple_hash *h; 413 + struct hlist_nulls_node *n; 411 414 u16 zone; 412 415 413 416 zone = nf_ct_zone(ct); 414 - hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 415 - repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 417 + hash = hash_conntrack(net, zone, 418 + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 419 + repl_hash = hash_conntrack(net, zone, 420 + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 416 421 422 + spin_lock_bh(&nf_conntrack_lock); 423 + 424 + /* See if there's one in the list already, including reverse */ 425 + hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) 426 + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 427 + &h->tuple) && 428 + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) 429 + goto out; 430 + hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) 431 + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, 432 + &h->tuple) && 433 + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) 434 + goto out; 435 + 436 + add_timer(&ct->timeout); 437 + nf_conntrack_get(&ct->ct_general); 417 438 __nf_conntrack_hash_insert(ct, hash, repl_hash); 439 + NF_CT_STAT_INC(net, insert); 440 + spin_unlock_bh(&nf_conntrack_lock); 441 + 442 + return 0; 443 + 444 + out: 445 + NF_CT_STAT_INC(net, insert_failed); 446 + spin_unlock_bh(&nf_conntrack_lock); 447 + return -EEXIST; 418 448 } 419 - EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert); 449 + EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); 420 450 421 451 /* Confirm a connection given skb; places it in hash table */ 422 452 int
+16 -30
net/netfilter/nf_conntrack_netlink.c
··· 1367 1367 nf_ct_protonum(ct)); 1368 1368 if (helper == NULL) { 1369 1369 rcu_read_unlock(); 1370 - spin_unlock_bh(&nf_conntrack_lock); 1371 1370 #ifdef CONFIG_MODULES 1372 1371 if (request_module("nfct-helper-%s", helpname) < 0) { 1373 - spin_lock_bh(&nf_conntrack_lock); 1374 1372 err = -EOPNOTSUPP; 1375 1373 goto err1; 1376 1374 } 1377 1375 1378 - spin_lock_bh(&nf_conntrack_lock); 1379 1376 rcu_read_lock(); 1380 1377 helper = __nf_conntrack_helper_find(helpname, 1381 1378 nf_ct_l3num(ct), ··· 1465 1468 if (tstamp) 1466 1469 tstamp->start = ktime_to_ns(ktime_get_real()); 1467 1470 1468 - add_timer(&ct->timeout); 1469 - nf_conntrack_hash_insert(ct); 1471 + err = nf_conntrack_hash_check_insert(ct); 1472 + if (err < 0) 1473 + goto err2; 1474 + 1470 1475 rcu_read_unlock(); 1471 1476 1472 1477 return ct; ··· 1489 1490 struct nf_conntrack_tuple otuple, rtuple; 1490 1491 struct nf_conntrack_tuple_hash *h = NULL; 1491 1492 struct nfgenmsg *nfmsg = nlmsg_data(nlh); 1493 + struct nf_conn *ct; 1492 1494 u_int8_t u3 = nfmsg->nfgen_family; 1493 1495 u16 zone; 1494 1496 int err; ··· 1510 1510 return err; 1511 1511 } 1512 1512 1513 - spin_lock_bh(&nf_conntrack_lock); 1514 1513 if (cda[CTA_TUPLE_ORIG]) 1515 - h = __nf_conntrack_find(net, zone, &otuple); 1514 + h = nf_conntrack_find_get(net, zone, &otuple); 1516 1515 else if (cda[CTA_TUPLE_REPLY]) 1517 - h = __nf_conntrack_find(net, zone, &rtuple); 1516 + h = nf_conntrack_find_get(net, zone, &rtuple); 1518 1517 1519 1518 if (h == NULL) { 1520 1519 err = -ENOENT; 1521 1520 if (nlh->nlmsg_flags & NLM_F_CREATE) { 1522 - struct nf_conn *ct; 1523 1521 enum ip_conntrack_events events; 1524 1522 1525 1523 ct = ctnetlink_create_conntrack(net, zone, cda, &otuple, 1526 1524 &rtuple, u3); 1527 - if (IS_ERR(ct)) { 1528 - err = PTR_ERR(ct); 1529 - goto out_unlock; 1530 - } 1525 + if (IS_ERR(ct)) 1526 + return PTR_ERR(ct); 1527 + 1531 1528 err = 0; 1532 - nf_conntrack_get(&ct->ct_general); 1533 - spin_unlock_bh(&nf_conntrack_lock); 1534 1529 
if (test_bit(IPS_EXPECTED_BIT, &ct->status)) 1535 1530 events = IPCT_RELATED; 1536 1531 else ··· 1540 1545 ct, NETLINK_CB(skb).pid, 1541 1546 nlmsg_report(nlh)); 1542 1547 nf_ct_put(ct); 1543 - } else 1544 - spin_unlock_bh(&nf_conntrack_lock); 1548 + } 1545 1549 1546 1550 return err; 1547 1551 } 1548 1552 /* implicit 'else' */ 1549 1553 1550 - /* We manipulate the conntrack inside the global conntrack table lock, 1551 - * so there's no need to increase the refcount */ 1552 1554 err = -EEXIST; 1555 + ct = nf_ct_tuplehash_to_ctrack(h); 1553 1556 if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { 1554 - struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); 1555 - 1557 + spin_lock_bh(&nf_conntrack_lock); 1556 1558 err = ctnetlink_change_conntrack(ct, cda); 1559 + spin_unlock_bh(&nf_conntrack_lock); 1557 1560 if (err == 0) { 1558 - nf_conntrack_get(&ct->ct_general); 1559 - spin_unlock_bh(&nf_conntrack_lock); 1560 1561 nf_conntrack_eventmask_report((1 << IPCT_REPLY) | 1561 1562 (1 << IPCT_ASSURED) | 1562 1563 (1 << IPCT_HELPER) | ··· 1561 1570 (1 << IPCT_MARK), 1562 1571 ct, NETLINK_CB(skb).pid, 1563 1572 nlmsg_report(nlh)); 1564 - nf_ct_put(ct); 1565 - } else 1566 - spin_unlock_bh(&nf_conntrack_lock); 1567 - 1568 - return err; 1573 + } 1569 1574 } 1570 1575 1571 - out_unlock: 1572 - spin_unlock_bh(&nf_conntrack_lock); 1576 + nf_ct_put(ct); 1573 1577 return err; 1574 1578 } 1575 1579