Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tunnel: drop packet if ECN present with not-ECT

Linux tunnels were written before RFC6040 and therefore never
implemented the corner case of ECN getting set in the outer header
and the inner header not being ready for it.

Section 4.2. Default Tunnel Egress Behaviour.
o If the inner ECN field is Not-ECT, the decapsulator MUST NOT
propagate any other ECN codepoint onwards. This is because the
inner Not-ECT marking is set by transports that rely on dropped
packets as an indication of congestion and would not understand or
respond to any other ECN codepoint [RFC4774]. Specifically:

* If the inner ECN field is Not-ECT and the outer ECN field is
CE, the decapsulator MUST drop the packet.

* If the inner ECN field is Not-ECT and the outer ECN field is
Not-ECT, ECT(0), or ECT(1), the decapsulator MUST forward the
outgoing packet with the ECN field cleared to Not-ECT.

This patch moves the ECN decap logic out of the individual tunnels
into a common place.

It also adds logging to help detect broken systems that set ECN
bits incorrectly when tunneling, or an intermediate router that
is changing the header.

Overloads the rx_frame_errors counter to keep track of ECN-related errors.

Thanks to Chris Wright who caught this while reviewing the new VXLAN
tunnel.

This code was tested by injecting faulty logic into GRE at the other
end of the tunnel so that it sent incorrectly encapsulated packets.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

stephen hemminger and committed by
David S. Miller
eccc1bb8 b0558ef2

+147 -63
+76
include/net/inet_ecn.h
··· 15 15 INET_ECN_MASK = 3, 16 16 }; 17 17 18 + extern int sysctl_tunnel_ecn_log; 19 + 18 20 static inline int INET_ECN_is_ce(__u8 dsfield) 19 21 { 20 22 return (dsfield & INET_ECN_MASK) == INET_ECN_CE; ··· 147 145 return 0; 148 146 } 149 147 148 + /* 149 + * RFC 6040 4.2 150 + * To decapsulate the inner header at the tunnel egress, a compliant 151 + * tunnel egress MUST set the outgoing ECN field to the codepoint at the 152 + * intersection of the appropriate arriving inner header (row) and outer 153 + * header (column) in Figure 4 154 + * 155 + * +---------+------------------------------------------------+ 156 + * |Arriving | Arriving Outer Header | 157 + * | Inner +---------+------------+------------+------------+ 158 + * | Header | Not-ECT | ECT(0) | ECT(1) | CE | 159 + * +---------+---------+------------+------------+------------+ 160 + * | Not-ECT | Not-ECT |Not-ECT(!!!)|Not-ECT(!!!)| <drop>(!!!)| 161 + * | ECT(0) | ECT(0) | ECT(0) | ECT(1) | CE | 162 + * | ECT(1) | ECT(1) | ECT(1) (!) | ECT(1) | CE | 163 + * | CE | CE | CE | CE(!!!)| CE | 164 + * +---------+---------+------------+------------+------------+ 165 + * 166 + * Figure 4: New IP in IP Decapsulation Behaviour 167 + * 168 + * returns 0 on success 169 + * 1 if something is broken and should be logged (!!! 
above) 170 + * 2 if packet should be dropped 171 + */ 172 + static inline int INET_ECN_decapsulate(struct sk_buff *skb, 173 + __u8 outer, __u8 inner) 174 + { 175 + if (INET_ECN_is_not_ect(inner)) { 176 + switch (outer & INET_ECN_MASK) { 177 + case INET_ECN_NOT_ECT: 178 + return 0; 179 + case INET_ECN_ECT_0: 180 + case INET_ECN_ECT_1: 181 + return 1; 182 + case INET_ECN_CE: 183 + return 2; 184 + } 185 + } 186 + 187 + if (INET_ECN_is_ce(outer)) 188 + INET_ECN_set_ce(skb); 189 + 190 + return 0; 191 + } 192 + 193 + static inline int IP_ECN_decapsulate(const struct iphdr *oiph, 194 + struct sk_buff *skb) 195 + { 196 + __u8 inner; 197 + 198 + if (skb->protocol == htons(ETH_P_IP)) 199 + inner = ip_hdr(skb)->tos; 200 + else if (skb->protocol == htons(ETH_P_IPV6)) 201 + inner = ipv6_get_dsfield(ipv6_hdr(skb)); 202 + else 203 + return 0; 204 + 205 + return INET_ECN_decapsulate(skb, oiph->tos, inner); 206 + } 207 + 208 + static inline int IP6_ECN_decapsulate(const struct ipv6hdr *oipv6h, 209 + struct sk_buff *skb) 210 + { 211 + __u8 inner; 212 + 213 + if (skb->protocol == htons(ETH_P_IP)) 214 + inner = ip_hdr(skb)->tos; 215 + else if (skb->protocol == htons(ETH_P_IPV6)) 216 + inner = ipv6_get_dsfield(ipv6_hdr(skb)); 217 + else 218 + return 0; 219 + 220 + return INET_ECN_decapsulate(skb, ipv6_get_dsfield(oipv6h), inner); 221 + } 150 222 #endif
+22 -16
net/ipv4/ip_gre.c
··· 120 120 Alexey Kuznetsov. 121 121 */ 122 122 123 + static bool log_ecn_error = true; 124 + module_param(log_ecn_error, bool, 0644); 125 + MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); 126 + 123 127 static struct rtnl_link_ops ipgre_link_ops __read_mostly; 124 128 static int ipgre_tunnel_init(struct net_device *dev); 125 129 static void ipgre_tunnel_setup(struct net_device *dev); ··· 208 204 tot->rx_crc_errors = dev->stats.rx_crc_errors; 209 205 tot->rx_fifo_errors = dev->stats.rx_fifo_errors; 210 206 tot->rx_length_errors = dev->stats.rx_length_errors; 207 + tot->rx_frame_errors = dev->stats.rx_frame_errors; 211 208 tot->rx_errors = dev->stats.rx_errors; 209 + 212 210 tot->tx_fifo_errors = dev->stats.tx_fifo_errors; 213 211 tot->tx_carrier_errors = dev->stats.tx_carrier_errors; 214 212 tot->tx_dropped = dev->stats.tx_dropped; ··· 593 587 t->err_time = jiffies; 594 588 } 595 589 596 - static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb) 597 - { 598 - if (INET_ECN_is_ce(iph->tos)) { 599 - if (skb->protocol == htons(ETH_P_IP)) { 600 - IP_ECN_set_ce(ip_hdr(skb)); 601 - } else if (skb->protocol == htons(ETH_P_IPV6)) { 602 - IP6_ECN_set_ce(ipv6_hdr(skb)); 603 - } 604 - } 605 - } 606 - 607 590 static inline u8 608 591 ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb) 609 592 { ··· 615 620 struct ip_tunnel *tunnel; 616 621 int offset = 4; 617 622 __be16 gre_proto; 623 + int err; 618 624 619 625 if (!pskb_may_pull(skb, 16)) 620 626 goto drop; ··· 719 723 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 720 724 } 721 725 726 + __skb_tunnel_rx(skb, tunnel->dev); 727 + 728 + skb_reset_network_header(skb); 729 + err = IP_ECN_decapsulate(iph, skb); 730 + if (unlikely(err)) { 731 + if (log_ecn_error) 732 + net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", 733 + &iph->saddr, iph->tos); 734 + if (err > 1) { 735 + ++tunnel->dev->stats.rx_frame_errors; 736 + 
++tunnel->dev->stats.rx_errors; 737 + goto drop; 738 + } 739 + } 740 + 722 741 tstats = this_cpu_ptr(tunnel->dev->tstats); 723 742 u64_stats_update_begin(&tstats->syncp); 724 743 tstats->rx_packets++; 725 744 tstats->rx_bytes += skb->len; 726 745 u64_stats_update_end(&tstats->syncp); 727 - 728 - __skb_tunnel_rx(skb, tunnel->dev); 729 - 730 - skb_reset_network_header(skb); 731 - ipgre_ecn_decapsulate(iph, skb); 732 746 733 747 netif_rx(skb); 734 748
+25 -17
net/ipv4/ipip.c
··· 120 120 #define HASH_SIZE 16 121 121 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) 122 122 123 + static bool log_ecn_error = true; 124 + module_param(log_ecn_error, bool, 0644); 125 + MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); 126 + 123 127 static int ipip_net_id __read_mostly; 124 128 struct ipip_net { 125 129 struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; ··· 404 400 return err; 405 401 } 406 402 407 - static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph, 408 - struct sk_buff *skb) 409 - { 410 - struct iphdr *inner_iph = ip_hdr(skb); 411 - 412 - if (INET_ECN_is_ce(outer_iph->tos)) 413 - IP_ECN_set_ce(inner_iph); 414 - } 415 - 416 403 static int ipip_rcv(struct sk_buff *skb) 417 404 { 418 405 struct ip_tunnel *tunnel; 419 406 const struct iphdr *iph = ip_hdr(skb); 407 + int err; 420 408 421 409 tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); 422 410 if (tunnel != NULL) { 423 411 struct pcpu_tstats *tstats; 424 412 425 - if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 426 - kfree_skb(skb); 427 - return 0; 428 - } 413 + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 414 + goto drop; 429 415 430 416 secpath_reset(skb); 431 417 ··· 424 430 skb->protocol = htons(ETH_P_IP); 425 431 skb->pkt_type = PACKET_HOST; 426 432 433 + __skb_tunnel_rx(skb, tunnel->dev); 434 + 435 + err = IP_ECN_decapsulate(iph, skb); 436 + if (unlikely(err)) { 437 + if (log_ecn_error) 438 + net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", 439 + &iph->saddr, iph->tos); 440 + if (err > 1) { 441 + ++tunnel->dev->stats.rx_frame_errors; 442 + ++tunnel->dev->stats.rx_errors; 443 + goto drop; 444 + } 445 + } 446 + 427 447 tstats = this_cpu_ptr(tunnel->dev->tstats); 428 448 u64_stats_update_begin(&tstats->syncp); 429 449 tstats->rx_packets++; 430 450 tstats->rx_bytes += skb->len; 431 451 u64_stats_update_end(&tstats->syncp); 432 452 433 - __skb_tunnel_rx(skb, tunnel->dev); 434 - 435 - 
ipip_ecn_decapsulate(iph, skb); 436 - 437 453 netif_rx(skb); 438 454 return 0; 439 455 } 440 456 441 457 return -1; 458 + 459 + drop: 460 + kfree_skb(skb); 461 + return 0; 442 462 } 443 463 444 464 /*
+24 -30
net/ipv6/ip6_gre.c
··· 56 56 #include <net/ip6_tunnel.h> 57 57 58 58 59 + static bool log_ecn_error = true; 60 + module_param(log_ecn_error, bool, 0644); 61 + MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); 62 + 59 63 #define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK) 60 64 #define IPV6_TCLASS_SHIFT 20 61 65 ··· 153 149 tot->rx_crc_errors = dev->stats.rx_crc_errors; 154 150 tot->rx_fifo_errors = dev->stats.rx_fifo_errors; 155 151 tot->rx_length_errors = dev->stats.rx_length_errors; 152 + tot->rx_frame_errors = dev->stats.rx_frame_errors; 156 153 tot->rx_errors = dev->stats.rx_errors; 154 + 157 155 tot->tx_fifo_errors = dev->stats.tx_fifo_errors; 158 156 tot->tx_carrier_errors = dev->stats.tx_carrier_errors; 159 157 tot->tx_dropped = dev->stats.tx_dropped; ··· 495 489 t->err_time = jiffies; 496 490 } 497 491 498 - static inline void ip6gre_ecn_decapsulate_ipv4(const struct ip6_tnl *t, 499 - const struct ipv6hdr *ipv6h, struct sk_buff *skb) 500 - { 501 - __u8 dsfield = ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK; 502 - 503 - if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) 504 - ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, dsfield); 505 - 506 - if (INET_ECN_is_ce(dsfield)) 507 - IP_ECN_set_ce(ip_hdr(skb)); 508 - } 509 - 510 - static inline void ip6gre_ecn_decapsulate_ipv6(const struct ip6_tnl *t, 511 - const struct ipv6hdr *ipv6h, struct sk_buff *skb) 512 - { 513 - if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) 514 - ipv6_copy_dscp(ipv6_get_dsfield(ipv6h), ipv6_hdr(skb)); 515 - 516 - if (INET_ECN_is_ce(ipv6_get_dsfield(ipv6h))) 517 - IP6_ECN_set_ce(ipv6_hdr(skb)); 518 - } 519 - 520 492 static int ip6gre_rcv(struct sk_buff *skb) 521 493 { 522 494 const struct ipv6hdr *ipv6h; ··· 506 522 struct ip6_tnl *tunnel; 507 523 int offset = 4; 508 524 __be16 gre_proto; 525 + int err; 509 526 510 527 if (!pskb_may_pull(skb, sizeof(struct in6_addr))) 511 528 goto drop; ··· 610 625 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 611 626 } 612 627 628 + 
__skb_tunnel_rx(skb, tunnel->dev); 629 + 630 + skb_reset_network_header(skb); 631 + 632 + err = IP6_ECN_decapsulate(ipv6h, skb); 633 + if (unlikely(err)) { 634 + if (log_ecn_error) 635 + net_info_ratelimited("non-ECT from %pI6 with dsfield=%#x\n", 636 + &ipv6h->saddr, 637 + ipv6_get_dsfield(ipv6h)); 638 + if (err > 1) { 639 + ++tunnel->dev->stats.rx_frame_errors; 640 + ++tunnel->dev->stats.rx_errors; 641 + goto drop; 642 + } 643 + } 644 + 613 645 tstats = this_cpu_ptr(tunnel->dev->tstats); 614 646 u64_stats_update_begin(&tstats->syncp); 615 647 tstats->rx_packets++; 616 648 tstats->rx_bytes += skb->len; 617 649 u64_stats_update_end(&tstats->syncp); 618 - 619 - __skb_tunnel_rx(skb, tunnel->dev); 620 - 621 - skb_reset_network_header(skb); 622 - if (skb->protocol == htons(ETH_P_IP)) 623 - ip6gre_ecn_decapsulate_ipv4(tunnel, ipv6h, skb); 624 - else if (skb->protocol == htons(ETH_P_IPV6)) 625 - ip6gre_ecn_decapsulate_ipv6(tunnel, ipv6h, skb); 626 650 627 651 netif_rx(skb); 628 652