Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: lwtunnel: Handle fragmentation

Today the mpls iptunnel lwtunnel_output redirect expects the tunnel
output function to handle fragmentation. This is OK, but it can be
avoided if we do not do the mpls output redirect too early;
i.e., we could wait until IP fragmentation is done and then call
mpls output for each IP fragment.

To make this work we need:
1) the lwtunnel state to carry the encap headroom
2) to do the redirect to the encap output handler on the IP fragment
(essentially, do the output redirect after fragmentation)

This patch adds tunnel headroom in lwtstate to make sure we
account for tunnel data in MTU calculations during fragmentation,
and adds a new xmit redirect handler to redirect to the lwtunnel
xmit function after IP fragmentation.

This includes IPV6 and some mtu fixes and testing from David Ahern.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Roopa Prabhu and committed by
David S. Miller
14972cbd 41852497

+106 -6
+44
include/net/lwtunnel.h
··· 13 13 /* lw tunnel state flags */ 14 14 #define LWTUNNEL_STATE_OUTPUT_REDIRECT BIT(0) 15 15 #define LWTUNNEL_STATE_INPUT_REDIRECT BIT(1) 16 + #define LWTUNNEL_STATE_XMIT_REDIRECT BIT(2) 17 + 18 + enum { 19 + LWTUNNEL_XMIT_DONE, 20 + LWTUNNEL_XMIT_CONTINUE, 21 + }; 22 + 16 23 17 24 struct lwtunnel_state { 18 25 __u16 type; ··· 28 21 int (*orig_output)(struct net *net, struct sock *sk, struct sk_buff *skb); 29 22 int (*orig_input)(struct sk_buff *); 30 23 int len; 24 + __u16 headroom; 31 25 __u8 data[0]; 32 26 }; 33 27 ··· 42 34 struct lwtunnel_state *lwtstate); 43 35 int (*get_encap_size)(struct lwtunnel_state *lwtstate); 44 36 int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b); 37 + int (*xmit)(struct sk_buff *skb); 45 38 }; 46 39 47 40 #ifdef CONFIG_LWTUNNEL ··· 84 75 85 76 return false; 86 77 } 78 + 79 + static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate) 80 + { 81 + if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_XMIT_REDIRECT)) 82 + return true; 83 + 84 + return false; 85 + } 86 + 87 + static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate, 88 + unsigned int mtu) 89 + { 90 + if (lwtunnel_xmit_redirect(lwtstate) && lwtstate->headroom < mtu) 91 + return lwtstate->headroom; 92 + 93 + return 0; 94 + } 95 + 87 96 int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, 88 97 unsigned int num); 89 98 int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, ··· 117 90 int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); 118 91 int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb); 119 92 int lwtunnel_input(struct sk_buff *skb); 93 + int lwtunnel_xmit(struct sk_buff *skb); 120 94 121 95 #else 122 96 ··· 143 115 static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate) 144 116 { 145 117 return false; 118 + } 119 + 120 + static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate) 121 + { 122 + return false; 123 + 
} 124 + 125 + static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate, 126 + unsigned int mtu) 127 + { 128 + return 0; 146 129 } 147 130 148 131 static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, ··· 205 166 } 206 167 207 168 static inline int lwtunnel_input(struct sk_buff *skb) 169 + { 170 + return -EOPNOTSUPP; 171 + } 172 + 173 + static inline int lwtunnel_xmit(struct sk_buff *skb) 208 174 { 209 175 return -EOPNOTSUPP; 210 176 }
+35
net/core/lwtunnel.c
··· 251 251 } 252 252 EXPORT_SYMBOL(lwtunnel_output); 253 253 254 + int lwtunnel_xmit(struct sk_buff *skb) 255 + { 256 + struct dst_entry *dst = skb_dst(skb); 257 + const struct lwtunnel_encap_ops *ops; 258 + struct lwtunnel_state *lwtstate; 259 + int ret = -EINVAL; 260 + 261 + if (!dst) 262 + goto drop; 263 + 264 + lwtstate = dst->lwtstate; 265 + 266 + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || 267 + lwtstate->type > LWTUNNEL_ENCAP_MAX) 268 + return 0; 269 + 270 + ret = -EOPNOTSUPP; 271 + rcu_read_lock(); 272 + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); 273 + if (likely(ops && ops->xmit)) 274 + ret = ops->xmit(skb); 275 + rcu_read_unlock(); 276 + 277 + if (ret == -EOPNOTSUPP) 278 + goto drop; 279 + 280 + return ret; 281 + 282 + drop: 283 + kfree_skb(skb); 284 + 285 + return ret; 286 + } 287 + EXPORT_SYMBOL(lwtunnel_xmit); 288 + 254 289 int lwtunnel_input(struct sk_buff *skb) 255 290 { 256 291 struct dst_entry *dst = skb_dst(skb);
+8
net/ipv4/ip_output.c
··· 73 73 #include <net/icmp.h> 74 74 #include <net/checksum.h> 75 75 #include <net/inetpeer.h> 76 + #include <net/lwtunnel.h> 76 77 #include <linux/igmp.h> 77 78 #include <linux/netfilter_ipv4.h> 78 79 #include <linux/netfilter_bridge.h> ··· 196 195 skb_set_owner_w(skb2, skb->sk); 197 196 consume_skb(skb); 198 197 skb = skb2; 198 + } 199 + 200 + if (lwtunnel_xmit_redirect(dst->lwtstate)) { 201 + int res = lwtunnel_xmit(skb); 202 + 203 + if (res < 0 || res == LWTUNNEL_XMIT_DONE) 204 + return res; 199 205 } 200 206 201 207 rcu_read_lock_bh();
+3 -1
net/ipv4/route.c
··· 1246 1246 mtu = 576; 1247 1247 } 1248 1248 1249 - return min_t(unsigned int, mtu, IP_MAX_MTU); 1249 + mtu = min_t(unsigned int, mtu, IP_MAX_MTU); 1250 + 1251 + return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 1250 1252 } 1251 1253 1252 1254 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
+8
net/ipv6/ip6_output.c
··· 56 56 #include <net/checksum.h> 57 57 #include <linux/mroute6.h> 58 58 #include <net/l3mdev.h> 59 + #include <net/lwtunnel.h> 59 60 60 61 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) 61 62 { ··· 103 102 kfree_skb(skb); 104 103 return 0; 105 104 } 105 + } 106 + 107 + if (lwtunnel_xmit_redirect(dst->lwtstate)) { 108 + int res = lwtunnel_xmit(skb); 109 + 110 + if (res < 0 || res == LWTUNNEL_XMIT_DONE) 111 + return res; 106 112 } 107 113 108 114 rcu_read_lock_bh();
+3 -1
net/ipv6/route.c
··· 1604 1604 rcu_read_unlock(); 1605 1605 1606 1606 out: 1607 - return min_t(unsigned int, mtu, IP6_MAX_MTU); 1607 + mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 1608 + 1609 + return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 1608 1610 } 1609 1611 1610 1612 static struct dst_entry *icmp6_dst_gc_list;
+5 -4
net/mpls/mpls_iptunnel.c
··· 37 37 return en->labels * sizeof(struct mpls_shim_hdr); 38 38 } 39 39 40 - static int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb) 40 + static int mpls_xmit(struct sk_buff *skb) 41 41 { 42 42 struct mpls_iptunnel_encap *tun_encap_info; 43 43 struct mpls_shim_hdr *hdr; ··· 115 115 net_dbg_ratelimited("%s: packet transmission failed: %d\n", 116 116 __func__, err); 117 117 118 - return 0; 118 + return LWTUNNEL_XMIT_DONE; 119 119 120 120 drop: 121 121 kfree_skb(skb); ··· 153 153 if (ret) 154 154 goto errout; 155 155 newts->type = LWTUNNEL_ENCAP_MPLS; 156 - newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; 156 + newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; 157 + newts->headroom = mpls_encap_size(tun_encap_info); 157 158 158 159 *ts = newts; 159 160 ··· 210 209 211 210 static const struct lwtunnel_encap_ops mpls_iptun_ops = { 212 211 .build_state = mpls_build_state, 213 - .output = mpls_output, 212 + .xmit = mpls_xmit, 214 213 .fill_encap = mpls_fill_encap_info, 215 214 .get_encap_size = mpls_encap_nlsize, 216 215 .cmp_encap = mpls_encap_cmp,