Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netfilter: conntrack: resched in nf_ct_iterate_cleanup

Ulrich reports a soft lockup with the following (shortened) call chain:

NMI watchdog: BUG: soft lockup - CPU#1 stuck for 22s!
__netif_receive_skb_core+0x6e4/0x774
process_backlog+0x94/0x160
net_rx_action+0x88/0x178
call_do_softirq+0x24/0x3c
do_softirq+0x54/0x6c
__local_bh_enable_ip+0x7c/0xbc
nf_ct_iterate_cleanup+0x11c/0x22c [nf_conntrack]
masq_inet_event+0x20/0x30 [nf_nat_masquerade_ipv6]
atomic_notifier_call_chain+0x1c/0x2c
ipv6_del_addr+0x1bc/0x220 [ipv6]

The problem is that nf_ct_iterate_cleanup can run for a very long time
since it can be interrupted by softirq processing.
Moreover, atomic_notifier_call_chain runs with the RCU read lock held.

So let's call cond_resched() in nf_ct_iterate_cleanup and defer
the call to a work queue for the atomic_notifier_call_chain case.

We also need another cond_resched in get_next_corpse, since we
have to deal with iter() always returning false; in that case
get_next_corpse will walk the entire conntrack table.

Reported-by: Ulrich Weber <uw@ocedo.com>
Tested-by: Ulrich Weber <uw@ocedo.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

authored by

Florian Westphal and committed by
Pablo Neira Ayuso
d93c6258 53729eb1

+76 -3
+71 -3
net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
··· 21 21 #include <net/ipv6.h> 22 22 #include <net/netfilter/ipv6/nf_nat_masquerade.h> 23 23 24 + #define MAX_WORK_COUNT 16 25 + 26 + static atomic_t v6_worker_count; 27 + 24 28 unsigned int 25 29 nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, 26 30 const struct net_device *out) ··· 82 78 .notifier_call = masq_device_event, 83 79 }; 84 80 81 + struct masq_dev_work { 82 + struct work_struct work; 83 + struct net *net; 84 + int ifindex; 85 + }; 86 + 87 + static void iterate_cleanup_work(struct work_struct *work) 88 + { 89 + struct masq_dev_work *w; 90 + long index; 91 + 92 + w = container_of(work, struct masq_dev_work, work); 93 + 94 + index = w->ifindex; 95 + nf_ct_iterate_cleanup(w->net, device_cmp, (void *)index, 0, 0); 96 + 97 + put_net(w->net); 98 + kfree(w); 99 + atomic_dec(&v6_worker_count); 100 + module_put(THIS_MODULE); 101 + } 102 + 103 + /* ipv6 inet notifier is an atomic notifier, i.e. we cannot 104 + * schedule. 105 + * 106 + * Unfortunately, nf_ct_iterate_cleanup can run for a long 107 + * time if there are lots of conntracks and the system 108 + * handles high softirq load, so it frequently calls cond_resched 109 + * while iterating the conntrack table. 110 + * 111 + * So we defer nf_ct_iterate_cleanup walk to the system workqueue. 112 + * 113 + * As we can have 'a lot' of inet_events (depending on amount 114 + * of ipv6 addresses being deleted), we also need to add an upper 115 + * limit to the number of queued work items. 
116 + */ 85 117 static int masq_inet_event(struct notifier_block *this, 86 118 unsigned long event, void *ptr) 87 119 { 88 120 struct inet6_ifaddr *ifa = ptr; 89 - struct netdev_notifier_info info; 121 + const struct net_device *dev; 122 + struct masq_dev_work *w; 123 + struct net *net; 90 124 91 - netdev_notifier_info_init(&info, ifa->idev->dev); 92 - return masq_device_event(this, event, &info); 125 + if (event != NETDEV_DOWN || 126 + atomic_read(&v6_worker_count) >= MAX_WORK_COUNT) 127 + return NOTIFY_DONE; 128 + 129 + dev = ifa->idev->dev; 130 + net = maybe_get_net(dev_net(dev)); 131 + if (!net) 132 + return NOTIFY_DONE; 133 + 134 + if (!try_module_get(THIS_MODULE)) 135 + goto err_module; 136 + 137 + w = kmalloc(sizeof(*w), GFP_ATOMIC); 138 + if (w) { 139 + atomic_inc(&v6_worker_count); 140 + 141 + INIT_WORK(&w->work, iterate_cleanup_work); 142 + w->ifindex = dev->ifindex; 143 + w->net = net; 144 + schedule_work(&w->work); 145 + 146 + return NOTIFY_DONE; 147 + } 148 + 149 + module_put(THIS_MODULE); 150 + err_module: 151 + put_net(net); 152 + return NOTIFY_DONE; 93 153 } 94 154 95 155 static struct notifier_block masq_inet_notifier = {
+5
net/netfilter/nf_conntrack_core.c
··· 1412 1412 } 1413 1413 spin_unlock(lockp); 1414 1414 local_bh_enable(); 1415 + cond_resched(); 1415 1416 } 1416 1417 1417 1418 for_each_possible_cpu(cpu) { ··· 1425 1424 set_bit(IPS_DYING_BIT, &ct->status); 1426 1425 } 1427 1426 spin_unlock_bh(&pcpu->lock); 1427 + cond_resched(); 1428 1428 } 1429 1429 return NULL; 1430 1430 found: ··· 1442 1440 struct nf_conn *ct; 1443 1441 unsigned int bucket = 0; 1444 1442 1443 + might_sleep(); 1444 + 1445 1445 while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) { 1446 1446 /* Time to push up daises... */ 1447 1447 if (del_timer(&ct->timeout)) ··· 1452 1448 /* ... else the timer will get him soon. */ 1453 1449 1454 1450 nf_ct_put(ct); 1451 + cond_resched(); 1455 1452 } 1456 1453 } 1457 1454 EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);