Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netfilter: bridge: forward IPv6 fragmented packets

IPv6 fragmented packets are not forwarded on an Ethernet bridge
with netfilter ip6_tables loaded. Steps to reproduce:

1) create a simple bridge like this

modprobe br_netfilter
brctl addbr br0
brctl addif br0 eth0
brctl addif br0 eth2
ifconfig eth0 up
ifconfig eth2 up
ifconfig br0 up

2) place a host with an IPv6 address on each side of the bridge

set IPv6 address on host A:
ip -6 addr add fd01:2345:6789:1::1/64 dev eth0

set IPv6 address on host B:
ip -6 addr add fd01:2345:6789:1::2/64 dev eth0

3) run a simple ping command on host A with packets > MTU

ping6 -s 4000 fd01:2345:6789:1::2

4) wait some time and run e.g. "ip6tables -t nat -nvL" on the bridge

IPv6 fragmented packets traverse the bridge cleanly until somebody runs
"ip6tables -t nat -nvL". As soon as it is run (and netfilter modules are
loaded), IPv6 fragmented packets do not traverse the bridge any more (you
see no more responses in ping's output).

After applying this patch, IPv6 fragmented packets traverse the bridge
cleanly in the above scenario.

Signed-off-by: Bernhard Thaler <bernhard.thaler@wvnet.at>
[pablo@netfilter.org: small changes to br_nf_dev_queue_xmit]
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

authored by

Bernhard Thaler and committed by
Pablo Neira Ayuso
efb6de9b a4611d3b

+108 -42
+2
include/linux/netfilter_ipv6.h
··· 26 26 int (*chk_addr)(struct net *net, const struct in6_addr *addr, 27 27 const struct net_device *dev, int strict); 28 28 void (*route_input)(struct sk_buff *skb); 29 + int (*fragment)(struct sock *sk, struct sk_buff *skb, 30 + int (*output)(struct sock *, struct sk_buff *)); 29 31 }; 30 32 31 33 extern const struct nf_ipv6_ops __rcu *nf_ipv6_ops;
+99 -40
net/bridge/br_netfilter.c
··· 34 34 35 35 #include <net/ip.h> 36 36 #include <net/ipv6.h> 37 + #include <net/addrconf.h> 37 38 #include <net/route.h> 38 39 #include <net/netfilter/br_netfilter.h> 39 40 ··· 321 320 return -1; 322 321 } 323 322 323 + /* Equivalent to br_validate_ipv4 for IPv6 */ 324 + static int br_validate_ipv6(struct sk_buff *skb) 325 + { 326 + const struct ipv6hdr *hdr; 327 + struct net_device *dev = skb->dev; 328 + struct inet6_dev *idev = in6_dev_get(skb->dev); 329 + u32 pkt_len; 330 + u8 ip6h_len = sizeof(struct ipv6hdr); 331 + 332 + if (!pskb_may_pull(skb, ip6h_len)) 333 + goto inhdr_error; 334 + 335 + if (skb->len < ip6h_len) 336 + goto drop; 337 + 338 + hdr = ipv6_hdr(skb); 339 + 340 + if (hdr->version != 6) 341 + goto inhdr_error; 342 + 343 + pkt_len = ntohs(hdr->payload_len); 344 + 345 + if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { 346 + if (pkt_len + ip6h_len > skb->len) { 347 + IP6_INC_STATS_BH(dev_net(dev), idev, 348 + IPSTATS_MIB_INTRUNCATEDPKTS); 349 + goto drop; 350 + } 351 + if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) { 352 + IP6_INC_STATS_BH(dev_net(dev), idev, 353 + IPSTATS_MIB_INDISCARDS); 354 + goto drop; 355 + } 356 + } 357 + if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb)) 358 + goto drop; 359 + 360 + memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); 361 + /* No IP options in IPv6 header; however it should be 362 + * checked if some next headers need special treatment 363 + */ 364 + return 0; 365 + 366 + inhdr_error: 367 + IP6_INC_STATS_BH(dev_net(dev), idev, IPSTATS_MIB_INHDRERRORS); 368 + drop: 369 + return -1; 370 + } 371 + 324 372 static void nf_bridge_update_protocol(struct sk_buff *skb) 325 373 { 326 374 switch (skb->nf_bridge->orig_proto) { ··· 454 404 struct rtable *rt; 455 405 struct net_device *dev = skb->dev; 456 406 const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); 407 + 408 + nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size; 457 409 458 410 if (nf_bridge->pkt_otherhost) { 459 411 skb->pkt_type = PACKET_OTHERHOST; ··· 
658 606 } 659 607 660 608 /* Replicate the checks that IPv6 does on packet reception and pass the packet 661 - * to ip6tables, which doesn't support NAT, so things are fairly simple. */ 609 + * to ip6tables. 610 + */ 662 611 static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops, 663 612 struct sk_buff *skb, 664 613 const struct nf_hook_state *state) 665 614 { 666 615 struct nf_bridge_info *nf_bridge; 667 - const struct ipv6hdr *hdr; 668 - u32 pkt_len; 669 616 670 - if (skb->len < sizeof(struct ipv6hdr)) 671 - return NF_DROP; 672 - 673 - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) 674 - return NF_DROP; 675 - 676 - hdr = ipv6_hdr(skb); 677 - 678 - if (hdr->version != 6) 679 - return NF_DROP; 680 - 681 - pkt_len = ntohs(hdr->payload_len); 682 - 683 - if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { 684 - if (pkt_len + sizeof(struct ipv6hdr) > skb->len) 685 - return NF_DROP; 686 - if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) 687 - return NF_DROP; 688 - } 689 - if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb)) 617 + if (br_validate_ipv6(skb)) 690 618 return NF_DROP; 691 619 692 620 nf_bridge_put(skb->nf_bridge); ··· 770 738 771 739 if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) { 772 740 773 - if (skb->protocol == htons(ETH_P_IP)) { 741 + if (skb->protocol == htons(ETH_P_IP)) 774 742 nf_bridge->frag_max_size = IPCB(skb)->frag_max_size; 775 - } 743 + 744 + if (skb->protocol == htons(ETH_P_IPV6)) 745 + nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size; 776 746 777 747 in = nf_bridge->physindev; 778 748 if (nf_bridge->pkt_otherhost) { ··· 842 808 IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; 843 809 } 844 810 811 + if (pf == NFPROTO_IPV6) { 812 + if (br_validate_ipv6(skb)) 813 + return NF_DROP; 814 + IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; 815 + } 816 + 845 817 nf_bridge->physoutdev = skb->dev; 846 818 if (pf == NFPROTO_IPV4) 847 819 skb->protocol = htons(ETH_P_IP); ··· 895 855 return NF_STOLEN; 896 856 } 897 857 898 
- #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) 858 + #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) 899 859 static int br_nf_push_frag_xmit(struct sock *sk, struct sk_buff *skb) 900 860 { 901 861 struct brnf_frag_data *data; ··· 915 875 nf_bridge_info_free(skb); 916 876 return br_dev_queue_push_xmit(sk, skb); 917 877 } 878 + #endif 918 879 919 880 static int br_nf_ip_fragment(struct sock *sk, struct sk_buff *skb, 920 881 int (*output)(struct sock *, struct sk_buff *)) ··· 938 897 939 898 static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) 940 899 { 941 - int ret; 942 900 struct nf_bridge_info *nf_bridge; 943 901 unsigned int mtu_reserved; 944 902 945 - if (skb_is_gso(skb) || skb->protocol != htons(ETH_P_IP)) { 903 + mtu_reserved = nf_bridge_mtu_reduction(skb); 904 + 905 + if (skb_is_gso(skb) || skb->len + mtu_reserved <= skb->dev->mtu) { 946 906 nf_bridge_info_free(skb); 947 907 return br_dev_queue_push_xmit(sk, skb); 948 908 } 949 909 950 - mtu_reserved = nf_bridge_mtu_reduction(skb); 951 910 nf_bridge = nf_bridge_info_get(skb); 911 + 912 + #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) 952 913 /* This is wrong! We should preserve the original fragment 953 914 * boundaries by preserving frag_list rather than refragmenting. 
954 915 */ 955 - if (skb->len + mtu_reserved > skb->dev->mtu) { 916 + if (skb->protocol == htons(ETH_P_IP)) { 956 917 struct brnf_frag_data *data; 957 918 958 919 if (br_validate_ipv4(skb)) ··· 971 928 skb_copy_from_linear_data_offset(skb, -data->size, data->mac, 972 929 data->size); 973 930 974 - ret = br_nf_ip_fragment(sk, skb, br_nf_push_frag_xmit); 975 - } else { 976 - nf_bridge_info_free(skb); 977 - ret = br_dev_queue_push_xmit(sk, skb); 931 + return br_nf_ip_fragment(sk, skb, br_nf_push_frag_xmit); 978 932 } 933 + #endif 934 + #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) 935 + if (skb->protocol == htons(ETH_P_IPV6)) { 936 + const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); 937 + struct brnf_frag_data *data; 979 938 980 - return ret; 981 - } 982 - #else 983 - static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) 984 - { 939 + if (br_validate_ipv6(skb)) 940 + return NF_DROP; 941 + 942 + IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; 943 + 944 + nf_bridge_update_protocol(skb); 945 + 946 + data = this_cpu_ptr(&brnf_frag_data_storage); 947 + data->encap_size = nf_bridge_encap_header_len(skb); 948 + data->size = ETH_HLEN + data->encap_size; 949 + 950 + skb_copy_from_linear_data_offset(skb, -data->size, data->mac, 951 + data->size); 952 + 953 + if (v6ops) 954 + return v6ops->fragment(sk, skb, br_nf_push_frag_xmit); 955 + else 956 + return -EMSGSIZE; 957 + } 958 + #endif 985 959 nf_bridge_info_free(skb); 986 960 return br_dev_queue_push_xmit(sk, skb); 987 961 } 988 - #endif 989 962 990 963 /* PF_BRIDGE/POST_ROUTING ********************************************/ 991 964 static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops,
+5 -1
net/bridge/br_private.h
··· 18 18 #include <linux/netpoll.h> 19 19 #include <linux/u64_stats_sync.h> 20 20 #include <net/route.h> 21 + #include <net/ip6_fib.h> 21 22 #include <linux/if_vlan.h> 22 23 23 24 #define BR_HASH_BITS 8 ··· 215 214 spinlock_t hash_lock; 216 215 struct hlist_head hash[BR_HASH_SIZE]; 217 216 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 218 - struct rtable fake_rtable; 217 + union { 218 + struct rtable fake_rtable; 219 + struct rt6_info fake_rt6_info; 220 + }; 219 221 bool nf_call_iptables; 220 222 bool nf_call_ip6tables; 221 223 bool nf_call_arptables;
+2 -1
net/ipv6/netfilter.c
··· 191 191 192 192 static const struct nf_ipv6_ops ipv6ops = { 193 193 .chk_addr = ipv6_chk_addr, 194 - .route_input = ip6_route_input 194 + .route_input = ip6_route_input, 195 + .fragment = ip6_fragment 195 196 }; 196 197 197 198 static const struct nf_afinfo nf_ip6_afinfo = {