Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netfilter: nf_conntrack: add IPS_OFFLOAD status bit

This new bit tells us that the conntrack entry is owned by the flow
table offload infrastructure.

# cat /proc/net/nf_conntrack
ipv4 2 tcp 6 src=10.141.10.2 dst=147.75.205.195 sport=36392 dport=443 src=147.75.205.195 dst=192.168.2.195 sport=443 dport=36392 [OFFLOAD] mark=0 zone=0 use=2

Note the [OFFLOAD] tag in the listing.

The timer of such conntrack entries looks stopped from userspace.
In practice, to make sure the conntrack entry does not go away, the
conntrack timer is periodically set to an arbitrarily large value that
gets refreshed on every iteration of the garbage collector, so it
never expires — and such entries display no internal state in the case
of TCP flows. This allows us to save a bit check in the packet path via
nf_ct_is_expired().

Conntrack entries that have been offloaded to the flow table
infrastructure cannot be deleted/flushed via ctnetlink. The flow table
infrastructure is also responsible for releasing this conntrack entry.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

+50 -6
+5 -1
include/uapi/linux/netfilter/nf_conntrack_common.h
··· 101 101 IPS_HELPER_BIT = 13, 102 102 IPS_HELPER = (1 << IPS_HELPER_BIT), 103 103 104 + /* Conntrack has been offloaded to flow table. */ 105 + IPS_OFFLOAD_BIT = 14, 106 + IPS_OFFLOAD = (1 << IPS_OFFLOAD_BIT), 107 + 104 108 /* Be careful here, modifying these bits can make things messy, 105 109 * so don't let users modify them directly. 106 110 */ 107 111 IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK | 108 112 IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING | 109 - IPS_SEQ_ADJUST | IPS_TEMPLATE), 113 + IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_OFFLOAD), 110 114 111 115 __IPS_MAX_BIT = 14, 112 116 };
+20
net/netfilter/nf_conntrack_core.c
··· 901 901 hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) { 902 902 tmp = nf_ct_tuplehash_to_ctrack(h); 903 903 904 + if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) 905 + continue; 906 + 904 907 if (nf_ct_is_expired(tmp)) { 905 908 nf_ct_gc_expired(tmp); 906 909 continue; ··· 978 975 return false; 979 976 } 980 977 978 + #define DAY (86400 * HZ) 979 + 980 + /* Set an arbitrary timeout large enough not to ever expire, this save 981 + * us a check for the IPS_OFFLOAD_BIT from the packet path via 982 + * nf_ct_is_expired(). 983 + */ 984 + static void nf_ct_offload_timeout(struct nf_conn *ct) 985 + { 986 + if (nf_ct_expires(ct) < DAY / 2) 987 + ct->timeout = nfct_time_stamp + DAY; 988 + } 989 + 981 990 static void gc_worker(struct work_struct *work) 982 991 { 983 992 unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u); ··· 1026 1011 tmp = nf_ct_tuplehash_to_ctrack(h); 1027 1012 1028 1013 scanned++; 1014 + if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) { 1015 + nf_ct_offload_timeout(tmp); 1016 + continue; 1017 + } 1018 + 1029 1019 if (nf_ct_is_expired(tmp)) { 1030 1020 nf_ct_gc_expired(tmp); 1031 1021 expired_count++;
+14 -1
net/netfilter/nf_conntrack_netlink.c
··· 1110 1110 .len = NF_CT_LABELS_MAX_SIZE }, 1111 1111 }; 1112 1112 1113 + static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data) 1114 + { 1115 + if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) 1116 + return 0; 1117 + 1118 + return ctnetlink_filter_match(ct, data); 1119 + } 1120 + 1113 1121 static int ctnetlink_flush_conntrack(struct net *net, 1114 1122 const struct nlattr * const cda[], 1115 1123 u32 portid, int report) ··· 1130 1122 return PTR_ERR(filter); 1131 1123 } 1132 1124 1133 - nf_ct_iterate_cleanup_net(net, ctnetlink_filter_match, filter, 1125 + nf_ct_iterate_cleanup_net(net, ctnetlink_flush_iterate, filter, 1134 1126 portid, report); 1135 1127 kfree(filter); 1136 1128 ··· 1175 1167 return -ENOENT; 1176 1168 1177 1169 ct = nf_ct_tuplehash_to_ctrack(h); 1170 + 1171 + if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) { 1172 + nf_ct_put(ct); 1173 + return -EBUSY; 1174 + } 1178 1175 1179 1176 if (cda[CTA_ID]) { 1180 1177 u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID]));
+3
net/netfilter/nf_conntrack_proto_tcp.c
··· 305 305 /* Print out the private part of the conntrack. */ 306 306 static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) 307 307 { 308 + if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) 309 + return; 310 + 308 311 seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]); 309 312 } 310 313 #endif
+8 -4
net/netfilter/nf_conntrack_standalone.c
··· 309 309 WARN_ON(!l4proto); 310 310 311 311 ret = -ENOSPC; 312 - seq_printf(s, "%-8s %u %-8s %u %ld ", 312 + seq_printf(s, "%-8s %u %-8s %u ", 313 313 l3proto_name(l3proto->l3proto), nf_ct_l3num(ct), 314 - l4proto_name(l4proto->l4proto), nf_ct_protonum(ct), 315 - nf_ct_expires(ct) / HZ); 314 + l4proto_name(l4proto->l4proto), nf_ct_protonum(ct)); 315 + 316 + if (!test_bit(IPS_OFFLOAD_BIT, &ct->status)) 317 + seq_printf(s, "%ld ", nf_ct_expires(ct) / HZ); 316 318 317 319 if (l4proto->print_conntrack) 318 320 l4proto->print_conntrack(s, ct); ··· 341 339 if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) 342 340 goto release; 343 341 344 - if (test_bit(IPS_ASSURED_BIT, &ct->status)) 342 + if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) 343 + seq_puts(s, "[OFFLOAD] "); 344 + else if (test_bit(IPS_ASSURED_BIT, &ct->status)) 345 345 seq_puts(s, "[ASSURED] "); 346 346 347 347 if (seq_has_overflowed(s))