Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netfilter: conntrack: add conntrack event timestamp

Nadia Pinaeva writes:
I am working on a tool that allows collecting network performance
metrics by using conntrack events.
Start time of a conntrack entry is used to evaluate seen_reply
latency, therefore the sooner it is timestamped, the better the
precision is.
In particular, when using this tool to compare the performance of the
same feature implemented using iptables/nftables/OVS it is crucial
to have the entry timestamped earlier to see any difference.

At this time, conntrack events can only get timestamped at recv time in
userspace, so there can be some delay between the event being generated
and the userspace process consuming the message.

There is sys/net/netfilter/nf_conntrack_timestamp, which adds a
64bit timestamp (ns resolution) that records start and stop times,
but it's not suited for this either: start time is the 'hashtable insertion
time', not 'conntrack allocation time'.

There is concern that moving the start-time moment to conntrack
allocation will add overhead in case of flooding, where conntrack
entries are allocated and released right away without getting inserted
into the hashtable.

Also, even if this was changed, it would not help with events other than
new (start time) and destroy (stop time).

Pablo suggested adding a new CTA_TIMESTAMP_EVENT attribute; this patch adds that feature.
The timestamp is recorded in case both events are requested and the
sys/net/netfilter/nf_conntrack_timestamp toggle is enabled.

Reported-by: Nadia Pinaeva <n.m.pinaeva@gmail.com>
Suggested-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

authored by

Florian Westphal and committed by
Pablo Neira Ayuso
601731fc 95f1c1e9

+61
+12
include/net/netfilter/nf_conntrack_ecache.h
··· 12 12 #include <linux/netfilter/nf_conntrack_common.h> 13 13 #include <linux/netfilter/nf_conntrack_tuple_common.h> 14 14 #include <net/netfilter/nf_conntrack_extend.h> 15 + #include <asm/local64.h> 15 16 16 17 enum nf_ct_ecache_state { 17 18 NFCT_ECACHE_DESTROY_FAIL, /* tried but failed to send destroy event */ ··· 21 20 22 21 struct nf_conntrack_ecache { 23 22 unsigned long cache; /* bitops want long */ 23 + #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP 24 + local64_t timestamp; /* event timestamp, in nanoseconds */ 25 + #endif 24 26 u16 ctmask; /* bitmask of ct events to be delivered */ 25 27 u16 expmask; /* bitmask of expect events to be delivered */ 26 28 u32 missed; /* missed events */ ··· 111 107 e = nf_ct_ecache_find(ct); 112 108 if (e == NULL) 113 109 return; 110 + 111 + #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP 112 + /* renew only if this is the first cached event, so that the 113 + * timestamp reflects the first, not the last, generated event. 114 + */ 115 + if (local64_read(&e->timestamp) && READ_ONCE(e->cache) == 0) 116 + local64_set(&e->timestamp, ktime_get_real_ns()); 117 + #endif 114 118 115 119 set_bit(event, &e->cache); 116 120 #endif
+23
net/netfilter/nf_conntrack_ecache.c
··· 162 162 return ret; 163 163 } 164 164 165 + static void nf_ct_ecache_tstamp_refresh(struct nf_conntrack_ecache *e) 166 + { 167 + #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP 168 + if (local64_read(&e->timestamp)) 169 + local64_set(&e->timestamp, ktime_get_real_ns()); 170 + #endif 171 + } 172 + 165 173 int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct, 166 174 u32 portid, int report) 167 175 { ··· 193 185 194 186 /* This is a resent of a destroy event? If so, skip missed */ 195 187 missed = e->portid ? 0 : e->missed; 188 + 189 + nf_ct_ecache_tstamp_refresh(e); 196 190 197 191 ret = __nf_conntrack_eventmask_report(e, events, missed, &item); 198 192 if (unlikely(ret < 0 && (events & (1 << IPCT_DESTROY)))) { ··· 307 297 } 308 298 } 309 299 300 + static void nf_ct_ecache_tstamp_new(const struct nf_conn *ct, struct nf_conntrack_ecache *e) 301 + { 302 + #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP 303 + u64 ts = 0; 304 + 305 + if (nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP)) 306 + ts = ktime_get_real_ns(); 307 + 308 + local64_set(&e->timestamp, ts); 309 + #endif 310 + } 311 + 310 312 bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp) 311 313 { 312 314 struct net *net = nf_ct_net(ct); ··· 348 326 349 327 e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp); 350 328 if (e) { 329 + nf_ct_ecache_tstamp_new(ct, e); 351 330 e->ctmask = ctmask; 352 331 e->expmask = expmask; 353 332 }
+25
net/netfilter/nf_conntrack_netlink.c
··· 383 383 #endif 384 384 385 385 #ifdef CONFIG_NF_CONNTRACK_EVENTS 386 + static int 387 + ctnetlink_dump_event_timestamp(struct sk_buff *skb, const struct nf_conn *ct) 388 + { 389 + #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP 390 + const struct nf_conntrack_ecache *e = nf_ct_ecache_find(ct); 391 + 392 + if (e) { 393 + u64 ts = local64_read(&e->timestamp); 394 + 395 + if (ts) 396 + return nla_put_be64(skb, CTA_TIMESTAMP_EVENT, 397 + cpu_to_be64(ts), CTA_TIMESTAMP_PAD); 398 + } 399 + #endif 400 + return 0; 401 + } 402 + 386 403 static inline int ctnetlink_label_size(const struct nf_conn *ct) 387 404 { 388 405 struct nf_conn_labels *labels = nf_ct_labels_find(ct); ··· 734 717 #endif 735 718 + ctnetlink_proto_size(ct) 736 719 + ctnetlink_label_size(ct) 720 + #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP 721 + + nla_total_size(sizeof(u64)) /* CTA_TIMESTAMP_EVENT */ 722 + #endif 737 723 ; 738 724 } 739 725 ··· 858 838 if (ctnetlink_dump_mark(skb, ct, events & (1 << IPCT_MARK))) 859 839 goto nla_put_failure; 860 840 #endif 841 + 842 + if (ctnetlink_dump_event_timestamp(skb, ct)) 843 + goto nla_put_failure; 844 + 861 845 nlmsg_end(skb, nlh); 862 846 err = nfnetlink_send(skb, net, item->portid, group, item->report, 863 847 GFP_ATOMIC); ··· 1581 1557 .len = NF_CT_LABELS_MAX_SIZE }, 1582 1558 [CTA_FILTER] = { .type = NLA_NESTED }, 1583 1559 [CTA_STATUS_MASK] = { .type = NLA_U32 }, 1560 + [CTA_TIMESTAMP_EVENT] = { .type = NLA_REJECT }, 1584 1561 }; 1585 1562 1586 1563 static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data)