Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netfilter: conntrack: remove timer from ecache extension

This brings the (per-conntrack) ecache extension back to 24 bytes in size
(was 152 bytes on x86_64 with lockdep on).

When event delivery fails, redelivery is attempted via a work queue.

Redelivery is attempted at least every 0.1 seconds, but can happen
more frequently if userspace is not congested.

The nf_ct_release_dying_list() function is removed.
With this patch, ownership of the to-be-redelivered conntracks
(those on the dying list whose DYING bit is not yet set) is with the
work queue, which will release the references once the event is out.

Joint work with Pablo Neira Ayuso.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

authored by

Florian Westphal and committed by
Pablo Neira Ayuso
9500507c f6b50824

+124 -72
+24 -2
include/net/netfilter/nf_conntrack_ecache.h
··· 18 18 u16 ctmask; /* bitmask of ct events to be delivered */ 19 19 u16 expmask; /* bitmask of expect events to be delivered */ 20 20 u32 portid; /* netlink portid of destroyer */ 21 - struct timer_list timeout; 22 21 }; 23 22 24 23 static inline struct nf_conntrack_ecache * ··· 215 216 216 217 int nf_conntrack_ecache_init(void); 217 218 void nf_conntrack_ecache_fini(void); 218 - #else /* CONFIG_NF_CONNTRACK_EVENTS */ 219 219 220 + static inline void nf_conntrack_ecache_delayed_work(struct net *net) 221 + { 222 + if (!delayed_work_pending(&net->ct.ecache_dwork)) { 223 + schedule_delayed_work(&net->ct.ecache_dwork, HZ); 224 + net->ct.ecache_dwork_pending = true; 225 + } 226 + } 227 + 228 + static inline void nf_conntrack_ecache_work(struct net *net) 229 + { 230 + if (net->ct.ecache_dwork_pending) { 231 + net->ct.ecache_dwork_pending = false; 232 + mod_delayed_work(system_wq, &net->ct.ecache_dwork, 0); 233 + } 234 + } 235 + #else /* CONFIG_NF_CONNTRACK_EVENTS */ 220 236 static inline void nf_conntrack_event_cache(enum ip_conntrack_events event, 221 237 struct nf_conn *ct) {} 222 238 static inline int nf_conntrack_eventmask_report(unsigned int eventmask, ··· 267 253 } 268 254 269 255 static inline void nf_conntrack_ecache_fini(void) 256 + { 257 + } 258 + 259 + static inline void nf_conntrack_ecache_delayed_work(struct net *net) 260 + { 261 + } 262 + 263 + static inline void nf_conntrack_ecache_work(struct net *net) 270 264 { 271 265 } 272 266 #endif /* CONFIG_NF_CONNTRACK_EVENTS */
+5 -1
include/net/netns/conntrack.h
··· 4 4 #include <linux/list.h> 5 5 #include <linux/list_nulls.h> 6 6 #include <linux/atomic.h> 7 + #include <linux/workqueue.h> 7 8 #include <linux/netfilter/nf_conntrack_tcp.h> 8 9 #include <linux/seqlock.h> 9 10 ··· 74 73 struct netns_ct { 75 74 atomic_t count; 76 75 unsigned int expect_count; 76 + #ifdef CONFIG_NF_CONNTRACK_EVENTS 77 + struct delayed_work ecache_dwork; 78 + bool ecache_dwork_pending; 79 + #endif 77 80 #ifdef CONFIG_SYSCTL 78 81 struct ctl_table_header *sysctl_header; 79 82 struct ctl_table_header *acct_sysctl_header; ··· 87 82 #endif 88 83 char *slabname; 89 84 unsigned int sysctl_log_invalid; /* Log invalid packets */ 90 - unsigned int sysctl_events_retry_timeout; 91 85 int sysctl_events; 92 86 int sysctl_acct; 93 87 int sysctl_auto_assign_helper;
+9 -59
net/netfilter/nf_conntrack_core.c
··· 352 352 local_bh_enable(); 353 353 } 354 354 355 - static void death_by_event(unsigned long ul_conntrack) 356 - { 357 - struct nf_conn *ct = (void *)ul_conntrack; 358 - struct net *net = nf_ct_net(ct); 359 - struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); 360 - 361 - BUG_ON(ecache == NULL); 362 - 363 - if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) { 364 - /* bad luck, let's retry again */ 365 - ecache->timeout.expires = jiffies + 366 - (prandom_u32() % net->ct.sysctl_events_retry_timeout); 367 - add_timer(&ecache->timeout); 368 - return; 369 - } 370 - /* we've got the event delivered, now it's dying */ 371 - set_bit(IPS_DYING_BIT, &ct->status); 372 - nf_ct_put(ct); 373 - } 374 - 375 - static void nf_ct_dying_timeout(struct nf_conn *ct) 376 - { 377 - struct net *net = nf_ct_net(ct); 378 - struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); 379 - 380 - BUG_ON(ecache == NULL); 381 - 382 - /* set a new timer to retry event delivery */ 383 - setup_timer(&ecache->timeout, death_by_event, (unsigned long)ct); 384 - ecache->timeout.expires = jiffies + 385 - (prandom_u32() % net->ct.sysctl_events_retry_timeout); 386 - add_timer(&ecache->timeout); 387 - } 388 - 389 355 bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) 390 356 { 391 357 struct nf_conn_tstamp *tstamp; ··· 360 394 if (tstamp && tstamp->stop == 0) 361 395 tstamp->stop = ktime_to_ns(ktime_get_real()); 362 396 363 - if (!nf_ct_is_dying(ct) && 364 - unlikely(nf_conntrack_event_report(IPCT_DESTROY, ct, 365 - portid, report) < 0)) { 397 + if (nf_ct_is_dying(ct)) 398 + goto delete; 399 + 400 + if (nf_conntrack_event_report(IPCT_DESTROY, ct, 401 + portid, report) < 0) { 366 402 /* destroy event was not delivered */ 367 403 nf_ct_delete_from_lists(ct); 368 - nf_ct_dying_timeout(ct); 404 + nf_conntrack_ecache_delayed_work(nf_ct_net(ct)); 369 405 return false; 370 406 } 407 + 408 + nf_conntrack_ecache_work(nf_ct_net(ct)); 371 409 set_bit(IPS_DYING_BIT, &ct->status); 410 + delete: 372 411 
nf_ct_delete_from_lists(ct); 373 412 nf_ct_put(ct); 374 413 return true; ··· 1435 1464 } 1436 1465 EXPORT_SYMBOL_GPL(nf_conntrack_flush_report); 1437 1466 1438 - static void nf_ct_release_dying_list(struct net *net) 1439 - { 1440 - struct nf_conntrack_tuple_hash *h; 1441 - struct nf_conn *ct; 1442 - struct hlist_nulls_node *n; 1443 - int cpu; 1444 - 1445 - for_each_possible_cpu(cpu) { 1446 - struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); 1447 - 1448 - spin_lock_bh(&pcpu->lock); 1449 - hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) { 1450 - ct = nf_ct_tuplehash_to_ctrack(h); 1451 - /* never fails to remove them, no listeners at this point */ 1452 - nf_ct_kill(ct); 1453 - } 1454 - spin_unlock_bh(&pcpu->lock); 1455 - } 1456 - } 1457 - 1458 1467 static int untrack_refs(void) 1459 1468 { 1460 1469 int cnt = 0, cpu; ··· 1499 1548 busy = 0; 1500 1549 list_for_each_entry(net, net_exit_list, exit_list) { 1501 1550 nf_ct_iterate_cleanup(net, kill_all, NULL, 0, 0); 1502 - nf_ct_release_dying_list(net); 1503 1551 if (atomic_read(&net->ct.count) != 0) 1504 1552 busy = 1; 1505 1553 }
+86 -10
net/netfilter/nf_conntrack_ecache.c
··· 29 29 30 30 static DEFINE_MUTEX(nf_ct_ecache_mutex); 31 31 32 + #define ECACHE_RETRY_WAIT (HZ/10) 33 + 34 + enum retry_state { 35 + STATE_CONGESTED, 36 + STATE_RESTART, 37 + STATE_DONE, 38 + }; 39 + 40 + static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) 41 + { 42 + struct nf_conn *refs[16]; 43 + struct nf_conntrack_tuple_hash *h; 44 + struct hlist_nulls_node *n; 45 + unsigned int evicted = 0; 46 + enum retry_state ret = STATE_DONE; 47 + 48 + spin_lock(&pcpu->lock); 49 + 50 + hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) { 51 + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); 52 + 53 + if (nf_ct_is_dying(ct)) 54 + continue; 55 + 56 + if (nf_conntrack_event(IPCT_DESTROY, ct)) { 57 + ret = STATE_CONGESTED; 58 + break; 59 + } 60 + 61 + /* we've got the event delivered, now it's dying */ 62 + set_bit(IPS_DYING_BIT, &ct->status); 63 + refs[evicted] = ct; 64 + 65 + if (++evicted >= ARRAY_SIZE(refs)) { 66 + ret = STATE_RESTART; 67 + break; 68 + } 69 + } 70 + 71 + spin_unlock(&pcpu->lock); 72 + 73 + /* can't _put while holding lock */ 74 + while (evicted) 75 + nf_ct_put(refs[--evicted]); 76 + 77 + return ret; 78 + } 79 + 80 + static void ecache_work(struct work_struct *work) 81 + { 82 + struct netns_ct *ctnet = 83 + container_of(work, struct netns_ct, ecache_dwork.work); 84 + int cpu, delay = -1; 85 + struct ct_pcpu *pcpu; 86 + 87 + local_bh_disable(); 88 + 89 + for_each_possible_cpu(cpu) { 90 + enum retry_state ret; 91 + 92 + pcpu = per_cpu_ptr(ctnet->pcpu_lists, cpu); 93 + 94 + ret = ecache_work_evict_list(pcpu); 95 + 96 + switch (ret) { 97 + case STATE_CONGESTED: 98 + delay = ECACHE_RETRY_WAIT; 99 + goto out; 100 + case STATE_RESTART: 101 + delay = 0; 102 + break; 103 + case STATE_DONE: 104 + break; 105 + } 106 + } 107 + 108 + out: 109 + local_bh_enable(); 110 + 111 + ctnet->ecache_dwork_pending = delay > 0; 112 + if (delay >= 0) 113 + schedule_delayed_work(&ctnet->ecache_dwork, delay); 114 + } 115 + 32 116 /* deliver cached events and 
clear cache entry - must be called with locally 33 117 * disabled softirqs */ 34 118 void nf_ct_deliver_cached_events(struct nf_conn *ct) ··· 241 157 242 158 #define NF_CT_EVENTS_DEFAULT 1 243 159 static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT; 244 - static int nf_ct_events_retry_timeout __read_mostly = 15*HZ; 245 160 246 161 #ifdef CONFIG_SYSCTL 247 162 static struct ctl_table event_sysctl_table[] = { ··· 250 167 .maxlen = sizeof(unsigned int), 251 168 .mode = 0644, 252 169 .proc_handler = proc_dointvec, 253 - }, 254 - { 255 - .procname = "nf_conntrack_events_retry_timeout", 256 - .data = &init_net.ct.sysctl_events_retry_timeout, 257 - .maxlen = sizeof(unsigned int), 258 - .mode = 0644, 259 - .proc_handler = proc_dointvec_jiffies, 260 170 }, 261 171 {} 262 172 }; ··· 272 196 goto out; 273 197 274 198 table[0].data = &net->ct.sysctl_events; 275 - table[1].data = &net->ct.sysctl_events_retry_timeout; 276 199 277 200 /* Don't export sysctls to unprivileged users */ 278 201 if (net->user_ns != &init_user_ns) ··· 313 238 int nf_conntrack_ecache_pernet_init(struct net *net) 314 239 { 315 240 net->ct.sysctl_events = nf_ct_events; 316 - net->ct.sysctl_events_retry_timeout = nf_ct_events_retry_timeout; 241 + INIT_DELAYED_WORK(&net->ct.ecache_dwork, ecache_work); 317 242 return nf_conntrack_event_init_sysctl(net); 318 243 } 319 244 320 245 void nf_conntrack_ecache_pernet_fini(struct net *net) 321 246 { 247 + cancel_delayed_work_sync(&net->ct.ecache_dwork); 322 248 nf_conntrack_event_fini_sysctl(net); 323 249 } 324 250