Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'act_ct-Software-offload-of-conntrack_in'

Paul Blakey says:

====================
act_ct: Software offload of conntrack_in

This series adds software offload of connections with an established
ct state using the NF flow table offload infrastructure, so
once such flows are offloaded, they will not pass through conntrack
again, and instead act_ct will restore the conntrack info metadata
on the skb to the state it had on the offload event - established.

Act_ct maintains an FT instance per ct zone. Flow table entries
are created, per ct connection, when connections enter an established
state and deleted otherwise. Once an entry is created, the FT assumes
ownership of the entry, and manages its aging.

On the datapath, first look up the skb in the zone's FT before going
into conntrack, and if a matching flow is found, restore the conntrack
info metadata on the skb, and skip calling conntrack.

Note that this patchset is part of the connection tracking offload feature.
Hardware offload of connections with an established ct state series will follow
this one.

Changelog:
v1->v2:
Removed now unused netfilter patches
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+355 -2
+2
include/net/tc_act/tc_ct.h
··· 25 25 u16 ct_action; 26 26 27 27 struct rcu_head rcu; 28 + 29 + struct tcf_ct_flow_table *ct_ft; 28 30 }; 29 31 30 32 struct tcf_ct {
+1 -1
net/sched/Kconfig
··· 972 972 973 973 config NET_ACT_CT 974 974 tristate "connection tracking tc action" 975 - depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT 975 + depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT && NF_FLOW_TABLE 976 976 help 977 977 Say Y here to allow sending the packets to conntrack module. 978 978
+352 -1
net/sched/act_ct.c
··· 15 15 #include <linux/pkt_cls.h> 16 16 #include <linux/ip.h> 17 17 #include <linux/ipv6.h> 18 + #include <linux/rhashtable.h> 18 19 #include <net/netlink.h> 19 20 #include <net/pkt_sched.h> 20 21 #include <net/pkt_cls.h> ··· 25 24 #include <uapi/linux/tc_act/tc_ct.h> 26 25 #include <net/tc_act/tc_ct.h> 27 26 27 + #include <net/netfilter/nf_flow_table.h> 28 28 #include <net/netfilter/nf_conntrack.h> 29 29 #include <net/netfilter/nf_conntrack_core.h> 30 30 #include <net/netfilter/nf_conntrack_zones.h> 31 31 #include <net/netfilter/nf_conntrack_helper.h> 32 32 #include <net/netfilter/ipv6/nf_defrag_ipv6.h> 33 33 #include <uapi/linux/netfilter/nf_nat.h> 34 + 35 + static struct workqueue_struct *act_ct_wq; 36 + static struct rhashtable zones_ht; 37 + static DEFINE_SPINLOCK(zones_lock); 38 + 39 + struct tcf_ct_flow_table { 40 + struct rhash_head node; /* In zones tables */ 41 + 42 + struct rcu_work rwork; 43 + struct nf_flowtable nf_ft; 44 + u16 zone; 45 + u32 ref; 46 + 47 + bool dying; 48 + }; 49 + 50 + static const struct rhashtable_params zones_params = { 51 + .head_offset = offsetof(struct tcf_ct_flow_table, node), 52 + .key_offset = offsetof(struct tcf_ct_flow_table, zone), 53 + .key_len = sizeof_field(struct tcf_ct_flow_table, zone), 54 + .automatic_shrinking = true, 55 + }; 56 + 57 + static struct nf_flowtable_type flowtable_ct = { 58 + .owner = THIS_MODULE, 59 + }; 60 + 61 + static int tcf_ct_flow_table_get(struct tcf_ct_params *params) 62 + { 63 + struct tcf_ct_flow_table *ct_ft; 64 + int err = -ENOMEM; 65 + 66 + spin_lock_bh(&zones_lock); 67 + ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params); 68 + if (ct_ft) 69 + goto take_ref; 70 + 71 + ct_ft = kzalloc(sizeof(*ct_ft), GFP_ATOMIC); 72 + if (!ct_ft) 73 + goto err_alloc; 74 + 75 + ct_ft->zone = params->zone; 76 + err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params); 77 + if (err) 78 + goto err_insert; 79 + 80 + ct_ft->nf_ft.type = &flowtable_ct; 81 + err = 
nf_flow_table_init(&ct_ft->nf_ft); 82 + if (err) 83 + goto err_init; 84 + 85 + __module_get(THIS_MODULE); 86 + take_ref: 87 + params->ct_ft = ct_ft; 88 + ct_ft->ref++; 89 + spin_unlock_bh(&zones_lock); 90 + 91 + return 0; 92 + 93 + err_init: 94 + rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params); 95 + err_insert: 96 + kfree(ct_ft); 97 + err_alloc: 98 + spin_unlock_bh(&zones_lock); 99 + return err; 100 + } 101 + 102 + static void tcf_ct_flow_table_cleanup_work(struct work_struct *work) 103 + { 104 + struct tcf_ct_flow_table *ct_ft; 105 + 106 + ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table, 107 + rwork); 108 + nf_flow_table_free(&ct_ft->nf_ft); 109 + kfree(ct_ft); 110 + 111 + module_put(THIS_MODULE); 112 + } 113 + 114 + static void tcf_ct_flow_table_put(struct tcf_ct_params *params) 115 + { 116 + struct tcf_ct_flow_table *ct_ft = params->ct_ft; 117 + 118 + spin_lock_bh(&zones_lock); 119 + if (--params->ct_ft->ref == 0) { 120 + rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params); 121 + INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work); 122 + queue_rcu_work(act_ct_wq, &ct_ft->rwork); 123 + } 124 + spin_unlock_bh(&zones_lock); 125 + } 126 + 127 + static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft, 128 + struct nf_conn *ct, 129 + bool tcp) 130 + { 131 + struct flow_offload *entry; 132 + int err; 133 + 134 + if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status)) 135 + return; 136 + 137 + entry = flow_offload_alloc(ct); 138 + if (!entry) { 139 + WARN_ON_ONCE(1); 140 + goto err_alloc; 141 + } 142 + 143 + if (tcp) { 144 + ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; 145 + ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; 146 + } 147 + 148 + err = flow_offload_add(&ct_ft->nf_ft, entry); 149 + if (err) 150 + goto err_add; 151 + 152 + return; 153 + 154 + err_add: 155 + flow_offload_free(entry); 156 + err_alloc: 157 + clear_bit(IPS_OFFLOAD_BIT, &ct->status); 158 + } 159 + 160 + static void 
tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft, 161 + struct nf_conn *ct, 162 + enum ip_conntrack_info ctinfo) 163 + { 164 + bool tcp = false; 165 + 166 + if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) 167 + return; 168 + 169 + switch (nf_ct_protonum(ct)) { 170 + case IPPROTO_TCP: 171 + tcp = true; 172 + if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) 173 + return; 174 + break; 175 + case IPPROTO_UDP: 176 + break; 177 + default: 178 + return; 179 + } 180 + 181 + if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) || 182 + ct->status & IPS_SEQ_ADJUST) 183 + return; 184 + 185 + tcf_ct_flow_table_add(ct_ft, ct, tcp); 186 + } 187 + 188 + static bool 189 + tcf_ct_flow_table_fill_tuple_ipv4(struct sk_buff *skb, 190 + struct flow_offload_tuple *tuple) 191 + { 192 + struct flow_ports *ports; 193 + unsigned int thoff; 194 + struct iphdr *iph; 195 + 196 + if (!pskb_may_pull(skb, sizeof(*iph))) 197 + return false; 198 + 199 + iph = ip_hdr(skb); 200 + thoff = iph->ihl * 4; 201 + 202 + if (ip_is_fragment(iph) || 203 + unlikely(thoff != sizeof(struct iphdr))) 204 + return false; 205 + 206 + if (iph->protocol != IPPROTO_TCP && 207 + iph->protocol != IPPROTO_UDP) 208 + return false; 209 + 210 + if (iph->ttl <= 1) 211 + return false; 212 + 213 + if (!pskb_may_pull(skb, thoff + sizeof(*ports))) 214 + return false; 215 + 216 + ports = (struct flow_ports *)(skb_network_header(skb) + thoff); 217 + 218 + tuple->src_v4.s_addr = iph->saddr; 219 + tuple->dst_v4.s_addr = iph->daddr; 220 + tuple->src_port = ports->source; 221 + tuple->dst_port = ports->dest; 222 + tuple->l3proto = AF_INET; 223 + tuple->l4proto = iph->protocol; 224 + 225 + return true; 226 + } 227 + 228 + static bool 229 + tcf_ct_flow_table_fill_tuple_ipv6(struct sk_buff *skb, 230 + struct flow_offload_tuple *tuple) 231 + { 232 + struct flow_ports *ports; 233 + struct ipv6hdr *ip6h; 234 + unsigned int thoff; 235 + 236 + if (!pskb_may_pull(skb, sizeof(*ip6h))) 237 + return false; 238 + 239 + 
ip6h = ipv6_hdr(skb); 240 + 241 + if (ip6h->nexthdr != IPPROTO_TCP && 242 + ip6h->nexthdr != IPPROTO_UDP) 243 + return false; 244 + 245 + if (ip6h->hop_limit <= 1) 246 + return false; 247 + 248 + thoff = sizeof(*ip6h); 249 + if (!pskb_may_pull(skb, thoff + sizeof(*ports))) 250 + return false; 251 + 252 + ports = (struct flow_ports *)(skb_network_header(skb) + thoff); 253 + 254 + tuple->src_v6 = ip6h->saddr; 255 + tuple->dst_v6 = ip6h->daddr; 256 + tuple->src_port = ports->source; 257 + tuple->dst_port = ports->dest; 258 + tuple->l3proto = AF_INET6; 259 + tuple->l4proto = ip6h->nexthdr; 260 + 261 + return true; 262 + } 263 + 264 + static bool tcf_ct_flow_table_check_tcp(struct flow_offload *flow, 265 + struct sk_buff *skb, 266 + unsigned int thoff) 267 + { 268 + struct tcphdr *tcph; 269 + 270 + if (!pskb_may_pull(skb, thoff + sizeof(*tcph))) 271 + return false; 272 + 273 + tcph = (void *)(skb_network_header(skb) + thoff); 274 + if (unlikely(tcph->fin || tcph->rst)) { 275 + flow_offload_teardown(flow); 276 + return false; 277 + } 278 + 279 + return true; 280 + } 281 + 282 + static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p, 283 + struct sk_buff *skb, 284 + u8 family) 285 + { 286 + struct nf_flowtable *nf_ft = &p->ct_ft->nf_ft; 287 + struct flow_offload_tuple_rhash *tuplehash; 288 + struct flow_offload_tuple tuple = {}; 289 + enum ip_conntrack_info ctinfo; 290 + struct flow_offload *flow; 291 + struct nf_conn *ct; 292 + unsigned int thoff; 293 + int ip_proto; 294 + u8 dir; 295 + 296 + /* Previously seen or loopback */ 297 + ct = nf_ct_get(skb, &ctinfo); 298 + if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED) 299 + return false; 300 + 301 + switch (family) { 302 + case NFPROTO_IPV4: 303 + if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple)) 304 + return false; 305 + break; 306 + case NFPROTO_IPV6: 307 + if (!tcf_ct_flow_table_fill_tuple_ipv6(skb, &tuple)) 308 + return false; 309 + break; 310 + default: 311 + return false; 312 + } 313 + 314 + 
tuplehash = flow_offload_lookup(nf_ft, &tuple); 315 + if (!tuplehash) 316 + return false; 317 + 318 + dir = tuplehash->tuple.dir; 319 + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); 320 + ct = flow->ct; 321 + 322 + ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED : 323 + IP_CT_ESTABLISHED_REPLY; 324 + 325 + thoff = ip_hdr(skb)->ihl * 4; 326 + ip_proto = ip_hdr(skb)->protocol; 327 + if (ip_proto == IPPROTO_TCP && 328 + !tcf_ct_flow_table_check_tcp(flow, skb, thoff)) 329 + return false; 330 + 331 + nf_conntrack_get(&ct->ct_general); 332 + nf_ct_set(skb, ct, ctinfo); 333 + 334 + return true; 335 + } 336 + 337 + static int tcf_ct_flow_tables_init(void) 338 + { 339 + return rhashtable_init(&zones_ht, &zones_params); 340 + } 341 + 342 + static void tcf_ct_flow_tables_uninit(void) 343 + { 344 + rhashtable_destroy(&zones_ht); 345 + } 34 346 35 347 static struct tc_action_ops act_ct_ops; 36 348 static unsigned int ct_net_id; ··· 520 206 { 521 207 struct tcf_ct_params *params = container_of(head, 522 208 struct tcf_ct_params, rcu); 209 + 210 + tcf_ct_flow_table_put(params); 523 211 524 212 if (params->tmpl) 525 213 nf_conntrack_put(&params->tmpl->ct_general); ··· 703 387 struct nf_hook_state state; 704 388 int nh_ofs, err, retval; 705 389 struct tcf_ct_params *p; 390 + bool skip_add = false; 706 391 struct nf_conn *ct; 707 392 u8 family; 708 393 ··· 753 436 */ 754 437 cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force); 755 438 if (!cached) { 439 + if (!commit && tcf_ct_flow_table_lookup(p, skb, family)) { 440 + skip_add = true; 441 + goto do_nat; 442 + } 443 + 756 444 /* Associate skb with specified zone. */ 757 445 if (tmpl) { 758 446 ct = nf_ct_get(skb, &ctinfo); ··· 775 453 goto out_push; 776 454 } 777 455 456 + do_nat: 778 457 ct = nf_ct_get(skb, &ctinfo); 779 458 if (!ct) 780 459 goto out_push; ··· 793 470 * even if the connection is already confirmed. 
794 471 */ 795 472 nf_conntrack_confirm(skb); 473 + } else if (!skip_add) { 474 + tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo); 796 475 } 797 476 798 477 out_push: ··· 1055 730 if (err) 1056 731 goto cleanup; 1057 732 733 + err = tcf_ct_flow_table_get(params); 734 + if (err) 735 + goto cleanup; 736 + 1058 737 spin_lock_bh(&c->tcf_lock); 1059 738 goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); 1060 739 params = rcu_replace_pointer(c->params, params, ··· 1303 974 1304 975 static int __init ct_init_module(void) 1305 976 { 1306 - return tcf_register_action(&act_ct_ops, &ct_net_ops); 977 + int err; 978 + 979 + act_ct_wq = alloc_ordered_workqueue("act_ct_workqueue", 0); 980 + if (!act_ct_wq) 981 + return -ENOMEM; 982 + 983 + err = tcf_ct_flow_tables_init(); 984 + if (err) 985 + goto err_tbl_init; 986 + 987 + err = tcf_register_action(&act_ct_ops, &ct_net_ops); 988 + if (err) 989 + goto err_register; 990 + 991 + return 0; 992 + 993 + err_tbl_init: 994 + destroy_workqueue(act_ct_wq); 995 + err_register: 996 + tcf_ct_flow_tables_uninit(); 997 + return err; 1307 998 } 1308 999 1309 1000 static void __exit ct_cleanup_module(void) 1310 1001 { 1311 1002 tcf_unregister_action(&act_ct_ops, &ct_net_ops); 1003 + tcf_ct_flow_tables_uninit(); 1004 + destroy_workqueue(act_ct_wq); 1312 1005 } 1313 1006 1314 1007 module_init(ct_init_module);