Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netfilter: push reasm skb through instead of original frag skbs

Pushing original fragments through causes several problems. For example,
when matching, frags may not be matched correctly. Take the following
example:

<example>
On HOSTA do:
ip6tables -I INPUT -p icmpv6 -j DROP
ip6tables -I INPUT -p icmpv6 -m icmp6 --icmpv6-type 128 -j ACCEPT

and on HOSTB you do:
ping6 HOSTA -s2000 (MTU is 1500)

Incoming echo requests will be filtered out on HOSTA. This issue does
not occur with packets smaller than the MTU (where fragmentation does not happen).
</example>

As was discussed previously, the only correct solution seems to be to use
the reassembled skb instead of separate frags. Doing this has positive side
effects in reducing sk_buff by one pointer (nfct_reasm) and also the reasm
dances in ipvs and conntrack can be removed.

Future plan is to remove net/ipv6/netfilter/nf_conntrack_reasm.c
entirely and use code in net/ipv6/reassembly.c instead.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Marcelo Ricardo Leitner <mleitner@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Jiri Pirko and committed by
David S. Miller
6aafeef0 9037c357

+13 -203
-32
include/linux/skbuff.h
··· 337 337 typedef unsigned char *sk_buff_data_t; 338 338 #endif 339 339 340 - #if defined(CONFIG_NF_DEFRAG_IPV4) || defined(CONFIG_NF_DEFRAG_IPV4_MODULE) || \ 341 - defined(CONFIG_NF_DEFRAG_IPV6) || defined(CONFIG_NF_DEFRAG_IPV6_MODULE) 342 - #define NET_SKBUFF_NF_DEFRAG_NEEDED 1 343 - #endif 344 - 345 340 /** 346 341 * struct sk_buff - socket buffer 347 342 * @next: Next buffer in list ··· 369 374 * @protocol: Packet protocol from driver 370 375 * @destructor: Destruct function 371 376 * @nfct: Associated connection, if any 372 - * @nfct_reasm: netfilter conntrack re-assembly pointer 373 377 * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c 374 378 * @skb_iif: ifindex of device we arrived on 375 379 * @tc_index: Traffic control index ··· 456 462 void (*destructor)(struct sk_buff *skb); 457 463 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) 458 464 struct nf_conntrack *nfct; 459 - #endif 460 - #ifdef NET_SKBUFF_NF_DEFRAG_NEEDED 461 - struct sk_buff *nfct_reasm; 462 465 #endif 463 466 #ifdef CONFIG_BRIDGE_NETFILTER 464 467 struct nf_bridge_info *nf_bridge; ··· 2586 2595 atomic_inc(&nfct->use); 2587 2596 } 2588 2597 #endif 2589 - #ifdef NET_SKBUFF_NF_DEFRAG_NEEDED 2590 - static inline void nf_conntrack_get_reasm(struct sk_buff *skb) 2591 - { 2592 - if (skb) 2593 - atomic_inc(&skb->users); 2594 - } 2595 - static inline void nf_conntrack_put_reasm(struct sk_buff *skb) 2596 - { 2597 - if (skb) 2598 - kfree_skb(skb); 2599 - } 2600 - #endif 2601 2598 #ifdef CONFIG_BRIDGE_NETFILTER 2602 2599 static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge) 2603 2600 { ··· 2603 2624 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) 2604 2625 nf_conntrack_put(skb->nfct); 2605 2626 skb->nfct = NULL; 2606 - #endif 2607 - #ifdef NET_SKBUFF_NF_DEFRAG_NEEDED 2608 - nf_conntrack_put_reasm(skb->nfct_reasm); 2609 - skb->nfct_reasm = NULL; 2610 2627 #endif 2611 2628 #ifdef CONFIG_BRIDGE_NETFILTER 2612 2629 
nf_bridge_put(skb->nf_bridge); ··· 2625 2650 nf_conntrack_get(src->nfct); 2626 2651 dst->nfctinfo = src->nfctinfo; 2627 2652 #endif 2628 - #ifdef NET_SKBUFF_NF_DEFRAG_NEEDED 2629 - dst->nfct_reasm = src->nfct_reasm; 2630 - nf_conntrack_get_reasm(src->nfct_reasm); 2631 - #endif 2632 2653 #ifdef CONFIG_BRIDGE_NETFILTER 2633 2654 dst->nf_bridge = src->nf_bridge; 2634 2655 nf_bridge_get(src->nf_bridge); ··· 2635 2664 { 2636 2665 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) 2637 2666 nf_conntrack_put(dst->nfct); 2638 - #endif 2639 - #ifdef NET_SKBUFF_NF_DEFRAG_NEEDED 2640 - nf_conntrack_put_reasm(dst->nfct_reasm); 2641 2667 #endif 2642 2668 #ifdef CONFIG_BRIDGE_NETFILTER 2643 2669 nf_bridge_put(dst->nf_bridge);
+1 -31
include/net/ip_vs.h
··· 109 109 struct ip_vs_iphdr { 110 110 __u32 len; /* IPv4 simply where L4 starts 111 111 IPv6 where L4 Transport Header starts */ 112 - __u32 thoff_reasm; /* Transport Header Offset in nfct_reasm skb */ 113 112 __u16 fragoffs; /* IPv6 fragment offset, 0 if first frag (or not frag)*/ 114 113 __s16 protocol; 115 114 __s32 flags; ··· 116 117 union nf_inet_addr daddr; 117 118 }; 118 119 119 - /* Dependency to module: nf_defrag_ipv6 */ 120 - #if defined(CONFIG_NF_DEFRAG_IPV6) || defined(CONFIG_NF_DEFRAG_IPV6_MODULE) 121 - static inline struct sk_buff *skb_nfct_reasm(const struct sk_buff *skb) 122 - { 123 - return skb->nfct_reasm; 124 - } 125 - static inline void *frag_safe_skb_hp(const struct sk_buff *skb, int offset, 126 - int len, void *buffer, 127 - const struct ip_vs_iphdr *ipvsh) 128 - { 129 - if (unlikely(ipvsh->fragoffs && skb_nfct_reasm(skb))) 130 - return skb_header_pointer(skb_nfct_reasm(skb), 131 - ipvsh->thoff_reasm, len, buffer); 132 - 133 - return skb_header_pointer(skb, offset, len, buffer); 134 - } 135 - #else 136 - static inline struct sk_buff *skb_nfct_reasm(const struct sk_buff *skb) 137 - { 138 - return NULL; 139 - } 140 120 static inline void *frag_safe_skb_hp(const struct sk_buff *skb, int offset, 141 121 int len, void *buffer, 142 122 const struct ip_vs_iphdr *ipvsh) 143 123 { 144 124 return skb_header_pointer(skb, offset, len, buffer); 145 125 } 146 - #endif 147 126 148 127 static inline void 149 128 ip_vs_fill_ip4hdr(const void *nh, struct ip_vs_iphdr *iphdr) ··· 148 171 (struct ipv6hdr *)skb_network_header(skb); 149 172 iphdr->saddr.in6 = iph->saddr; 150 173 iphdr->daddr.in6 = iph->daddr; 151 - /* ipv6_find_hdr() updates len, flags, thoff_reasm */ 152 - iphdr->thoff_reasm = 0; 174 + /* ipv6_find_hdr() updates len, flags */ 153 175 iphdr->len = 0; 154 176 iphdr->flags = 0; 155 177 iphdr->protocol = ipv6_find_hdr(skb, &iphdr->len, -1, 156 178 &iphdr->fragoffs, 157 179 &iphdr->flags); 158 - /* get proto from re-assembled packet and it's offset 
*/ 159 - if (skb_nfct_reasm(skb)) 160 - iphdr->protocol = ipv6_find_hdr(skb_nfct_reasm(skb), 161 - &iphdr->thoff_reasm, 162 - -1, NULL, NULL); 163 - 164 180 } else 165 181 #endif 166 182 {
+1 -3
include/net/netfilter/ipv6/nf_defrag_ipv6.h
··· 6 6 int nf_ct_frag6_init(void); 7 7 void nf_ct_frag6_cleanup(void); 8 8 struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user); 9 - void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb, 10 - struct net_device *in, struct net_device *out, 11 - int (*okfn)(struct sk_buff *)); 9 + void nf_ct_frag6_consume_orig(struct sk_buff *skb); 12 10 13 11 struct inet_frags_ctl; 14 12
-3
net/core/skbuff.c
··· 592 592 #if IS_ENABLED(CONFIG_NF_CONNTRACK) 593 593 nf_conntrack_put(skb->nfct); 594 594 #endif 595 - #ifdef NET_SKBUFF_NF_DEFRAG_NEEDED 596 - nf_conntrack_put_reasm(skb->nfct_reasm); 597 - #endif 598 595 #ifdef CONFIG_BRIDGE_NETFILTER 599 596 nf_bridge_put(skb->nf_bridge); 600 597 #endif
+2 -54
net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
··· 169 169 return nf_conntrack_confirm(skb); 170 170 } 171 171 172 - static unsigned int __ipv6_conntrack_in(struct net *net, 173 - unsigned int hooknum, 174 - struct sk_buff *skb, 175 - const struct net_device *in, 176 - const struct net_device *out, 177 - int (*okfn)(struct sk_buff *)) 178 - { 179 - struct sk_buff *reasm = skb->nfct_reasm; 180 - const struct nf_conn_help *help; 181 - struct nf_conn *ct; 182 - enum ip_conntrack_info ctinfo; 183 - 184 - /* This packet is fragmented and has reassembled packet. */ 185 - if (reasm) { 186 - /* Reassembled packet isn't parsed yet ? */ 187 - if (!reasm->nfct) { 188 - unsigned int ret; 189 - 190 - ret = nf_conntrack_in(net, PF_INET6, hooknum, reasm); 191 - if (ret != NF_ACCEPT) 192 - return ret; 193 - } 194 - 195 - /* Conntrack helpers need the entire reassembled packet in the 196 - * POST_ROUTING hook. In case of unconfirmed connections NAT 197 - * might reassign a helper, so the entire packet is also 198 - * required. 199 - */ 200 - ct = nf_ct_get(reasm, &ctinfo); 201 - if (ct != NULL && !nf_ct_is_untracked(ct)) { 202 - help = nfct_help(ct); 203 - if ((help && help->helper) || !nf_ct_is_confirmed(ct)) { 204 - nf_conntrack_get_reasm(reasm); 205 - NF_HOOK_THRESH(NFPROTO_IPV6, hooknum, reasm, 206 - (struct net_device *)in, 207 - (struct net_device *)out, 208 - okfn, NF_IP6_PRI_CONNTRACK + 1); 209 - return NF_DROP_ERR(-ECANCELED); 210 - } 211 - } 212 - 213 - nf_conntrack_get(reasm->nfct); 214 - skb->nfct = reasm->nfct; 215 - skb->nfctinfo = reasm->nfctinfo; 216 - return NF_ACCEPT; 217 - } 218 - 219 - return nf_conntrack_in(net, PF_INET6, hooknum, skb); 220 - } 221 - 222 172 static unsigned int ipv6_conntrack_in(const struct nf_hook_ops *ops, 223 173 struct sk_buff *skb, 224 174 const struct net_device *in, 225 175 const struct net_device *out, 226 176 int (*okfn)(struct sk_buff *)) 227 177 { 228 - return __ipv6_conntrack_in(dev_net(in), ops->hooknum, skb, in, out, 229 - okfn); 178 + return nf_conntrack_in(dev_net(in), 
PF_INET6, ops->hooknum, skb); 230 179 } 231 180 232 181 static unsigned int ipv6_conntrack_local(const struct nf_hook_ops *ops, ··· 189 240 net_notice_ratelimited("ipv6_conntrack_local: packet too short\n"); 190 241 return NF_ACCEPT; 191 242 } 192 - return __ipv6_conntrack_in(dev_net(out), ops->hooknum, skb, in, out, 193 - okfn); 243 + return nf_conntrack_in(dev_net(out), PF_INET6, ops->hooknum, skb); 194 244 } 195 245 196 246 static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = {
+2 -17
net/ipv6/netfilter/nf_conntrack_reasm.c
··· 633 633 return skb; 634 634 } 635 635 636 - void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb, 637 - struct net_device *in, struct net_device *out, 638 - int (*okfn)(struct sk_buff *)) 636 + void nf_ct_frag6_consume_orig(struct sk_buff *skb) 639 637 { 640 638 struct sk_buff *s, *s2; 641 - unsigned int ret = 0; 642 639 643 640 for (s = NFCT_FRAG6_CB(skb)->orig; s;) { 644 - nf_conntrack_put_reasm(s->nfct_reasm); 645 - nf_conntrack_get_reasm(skb); 646 - s->nfct_reasm = skb; 647 - 648 641 s2 = s->next; 649 642 s->next = NULL; 650 - 651 - if (ret != -ECANCELED) 652 - ret = NF_HOOK_THRESH(NFPROTO_IPV6, hooknum, s, 653 - in, out, okfn, 654 - NF_IP6_PRI_CONNTRACK_DEFRAG + 1); 655 - else 656 - kfree_skb(s); 657 - 643 + consume_skb(s); 658 644 s = s2; 659 645 } 660 - nf_conntrack_put_reasm(skb); 661 646 } 662 647 663 648 static int nf_ct_net_init(struct net *net)
+5 -2
net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
··· 75 75 if (reasm == skb) 76 76 return NF_ACCEPT; 77 77 78 - nf_ct_frag6_output(ops->hooknum, reasm, (struct net_device *)in, 79 - (struct net_device *)out, okfn); 78 + nf_ct_frag6_consume_orig(reasm); 79 + 80 + NF_HOOK_THRESH(NFPROTO_IPV6, ops->hooknum, reasm, 81 + (struct net_device *) in, (struct net_device *) out, 82 + okfn, NF_IP6_PRI_CONNTRACK_DEFRAG + 1); 80 83 81 84 return NF_STOLEN; 82 85 }
+1 -54
net/netfilter/ipvs/ip_vs_core.c
··· 1139 1139 ip_vs_fill_iph_skb(af, skb, &iph); 1140 1140 #ifdef CONFIG_IP_VS_IPV6 1141 1141 if (af == AF_INET6) { 1142 - if (!iph.fragoffs && skb_nfct_reasm(skb)) { 1143 - struct sk_buff *reasm = skb_nfct_reasm(skb); 1144 - /* Save fw mark for coming frags */ 1145 - reasm->ipvs_property = 1; 1146 - reasm->mark = skb->mark; 1147 - } 1148 1142 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { 1149 1143 int related; 1150 1144 int verdict = ip_vs_out_icmp_v6(skb, &related, ··· 1608 1614 1609 1615 #ifdef CONFIG_IP_VS_IPV6 1610 1616 if (af == AF_INET6) { 1611 - if (!iph.fragoffs && skb_nfct_reasm(skb)) { 1612 - struct sk_buff *reasm = skb_nfct_reasm(skb); 1613 - /* Save fw mark for coming frags. */ 1614 - reasm->ipvs_property = 1; 1615 - reasm->mark = skb->mark; 1616 - } 1617 1617 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { 1618 1618 int related; 1619 1619 int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum, ··· 1659 1671 /* sorry, all this trouble for a no-hit :) */ 1660 1672 IP_VS_DBG_PKT(12, af, pp, skb, 0, 1661 1673 "ip_vs_in: packet continues traversal as normal"); 1662 - if (iph.fragoffs && !skb_nfct_reasm(skb)) { 1674 + if (iph.fragoffs) { 1663 1675 /* Fragment that couldn't be mapped to a conn entry 1664 - * and don't have any pointer to a reasm skb 1665 1676 * is missing module nf_defrag_ipv6 1666 1677 */ 1667 1678 IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n"); ··· 1741 1754 } 1742 1755 1743 1756 #ifdef CONFIG_IP_VS_IPV6 1744 - 1745 - /* 1746 - * AF_INET6 fragment handling 1747 - * Copy info from first fragment, to the rest of them. 1748 - */ 1749 - static unsigned int 1750 - ip_vs_preroute_frag6(const struct nf_hook_ops *ops, struct sk_buff *skb, 1751 - const struct net_device *in, 1752 - const struct net_device *out, 1753 - int (*okfn)(struct sk_buff *)) 1754 - { 1755 - struct sk_buff *reasm = skb_nfct_reasm(skb); 1756 - struct net *net; 1757 - 1758 - /* Skip if not a "replay" from nf_ct_frag6_output or first fragment. 
1759 - * ipvs_property is set when checking first fragment 1760 - * in ip_vs_in() and ip_vs_out(). 1761 - */ 1762 - if (reasm) 1763 - IP_VS_DBG(2, "Fragment recv prop:%d\n", reasm->ipvs_property); 1764 - if (!reasm || !reasm->ipvs_property) 1765 - return NF_ACCEPT; 1766 - 1767 - net = skb_net(skb); 1768 - if (!net_ipvs(net)->enable) 1769 - return NF_ACCEPT; 1770 - 1771 - /* Copy stored fw mark, saved in ip_vs_{in,out} */ 1772 - skb->mark = reasm->mark; 1773 - 1774 - return NF_ACCEPT; 1775 - } 1776 1757 1777 1758 /* 1778 1759 * AF_INET6 handler in NF_INET_LOCAL_IN chain ··· 1879 1924 .priority = 100, 1880 1925 }, 1881 1926 #ifdef CONFIG_IP_VS_IPV6 1882 - /* After mangle & nat fetch 2:nd fragment and following */ 1883 - { 1884 - .hook = ip_vs_preroute_frag6, 1885 - .owner = THIS_MODULE, 1886 - .pf = NFPROTO_IPV6, 1887 - .hooknum = NF_INET_PRE_ROUTING, 1888 - .priority = NF_IP6_PRI_NAT_DST + 1, 1889 - }, 1890 1927 /* After packet filtering, change source only for VS/NAT */ 1891 1928 { 1892 1929 .hook = ip_vs_reply6,
+1 -7
net/netfilter/ipvs/ip_vs_pe_sip.c
··· 65 65 static int 66 66 ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb) 67 67 { 68 - struct sk_buff *reasm = skb_nfct_reasm(skb); 69 68 struct ip_vs_iphdr iph; 70 69 unsigned int dataoff, datalen, matchoff, matchlen; 71 70 const char *dptr; ··· 78 79 /* todo: IPv6 fragments: 79 80 * I think this only should be done for the first fragment. /HS 80 81 */ 81 - if (reasm) { 82 - skb = reasm; 83 - dataoff = iph.thoff_reasm + sizeof(struct udphdr); 84 - } else 85 - dataoff = iph.len + sizeof(struct udphdr); 82 + dataoff = iph.len + sizeof(struct udphdr); 86 83 87 84 if (dataoff >= skb->len) 88 85 return -EINVAL; 89 - /* todo: Check if this will mess-up the reasm skb !!! /HS */ 90 86 retc = skb_linearize(skb); 91 87 if (retc < 0) 92 88 return retc;