Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sched: consistently handle layer3 header accesses in the presence of VLANs

There are a couple of places in net/sched/ that check skb->protocol and act
on the value there. However, in the presence of VLAN tags, the value stored
in skb->protocol can be inconsistent based on whether VLAN acceleration is
enabled. The commit quoted in the Fixes tag below fixed the users of
skb->protocol to use a helper that will always see the VLAN ethertype.

However, most of the callers don't actually handle the VLAN ethertype, but
expect to find the IP header type in the protocol field. This means that
things like changing the ECN field, or parsing diffserv values, stop
working if there's a VLAN tag, or if there are multiple nested VLAN
tags (QinQ).

To fix this, change the helper to take an argument that indicates whether
the caller wants to skip the VLAN tags or not. When skipping VLAN tags, we
make sure to skip all of them, so behaviour is consistent even in QinQ
mode.

To make the helper usable from the ECN code, move it to if_vlan.h instead
of pkt_sched.h.

v3:
- Remove empty lines
- Move vlan variable definitions inside loop in skb_protocol()
- Also use skb_protocol() helper in IP{,6}_ECN_decapsulate() and
bpf_skb_ecn_set_ce()

v2:
- Use eth_type_vlan() helper in skb_protocol()
- Also fix code that reads skb->protocol directly
- Change a couple of 'if/else if' statements to switch constructs to avoid
calling the helper twice

Reported-by: Ilya Ponetayev <i.ponetaev@ndmsystems.com>
Fixes: d8b9605d2697 ("net: sched: fix skb->protocol use in case of accelerated vlan path")
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Toke Høiland-Jørgensen and committed by
David S. Miller
d7bf2ebe ad4e2b64

+86 -51
+28
include/linux/if_vlan.h
··· 308 308 } 309 309 } 310 310 311 + /* A getter for the SKB protocol field which will handle VLAN tags consistently 312 + * whether VLAN acceleration is enabled or not. 313 + */ 314 + static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan) 315 + { 316 + unsigned int offset = skb_mac_offset(skb) + sizeof(struct ethhdr); 317 + __be16 proto = skb->protocol; 318 + 319 + if (!skip_vlan) 320 + /* VLAN acceleration strips the VLAN header from the skb and 321 + * moves it to skb->vlan_proto 322 + */ 323 + return skb_vlan_tag_present(skb) ? skb->vlan_proto : proto; 324 + 325 + while (eth_type_vlan(proto)) { 326 + struct vlan_hdr vhdr, *vh; 327 + 328 + vh = skb_header_pointer(skb, offset, sizeof(vhdr), &vhdr); 329 + if (!vh) 330 + break; 331 + 332 + proto = vh->h_vlan_encapsulated_proto; 333 + offset += sizeof(vhdr); 334 + } 335 + 336 + return proto; 337 + } 338 + 311 339 static inline bool vlan_hw_offload_capable(netdev_features_t features, 312 340 __be16 proto) 313 341 {
+17 -8
include/net/inet_ecn.h
··· 4 4 5 5 #include <linux/ip.h> 6 6 #include <linux/skbuff.h> 7 + #include <linux/if_vlan.h> 7 8 8 9 #include <net/inet_sock.h> 9 10 #include <net/dsfield.h> ··· 173 172 174 173 static inline int INET_ECN_set_ce(struct sk_buff *skb) 175 174 { 176 - switch (skb->protocol) { 175 + switch (skb_protocol(skb, true)) { 177 176 case cpu_to_be16(ETH_P_IP): 178 177 if (skb_network_header(skb) + sizeof(struct iphdr) <= 179 178 skb_tail_pointer(skb)) ··· 192 191 193 192 static inline int INET_ECN_set_ect1(struct sk_buff *skb) 194 193 { 195 - switch (skb->protocol) { 194 + switch (skb_protocol(skb, true)) { 196 195 case cpu_to_be16(ETH_P_IP): 197 196 if (skb_network_header(skb) + sizeof(struct iphdr) <= 198 197 skb_tail_pointer(skb)) ··· 273 272 { 274 273 __u8 inner; 275 274 276 - if (skb->protocol == htons(ETH_P_IP)) 275 + switch (skb_protocol(skb, true)) { 276 + case htons(ETH_P_IP): 277 277 inner = ip_hdr(skb)->tos; 278 - else if (skb->protocol == htons(ETH_P_IPV6)) 278 + break; 279 + case htons(ETH_P_IPV6): 279 280 inner = ipv6_get_dsfield(ipv6_hdr(skb)); 280 - else 281 + break; 282 + default: 281 283 return 0; 284 + } 282 285 283 286 return INET_ECN_decapsulate(skb, oiph->tos, inner); 284 287 } ··· 292 287 { 293 288 __u8 inner; 294 289 295 - if (skb->protocol == htons(ETH_P_IP)) 290 + switch (skb_protocol(skb, true)) { 291 + case htons(ETH_P_IP): 296 292 inner = ip_hdr(skb)->tos; 297 - else if (skb->protocol == htons(ETH_P_IPV6)) 293 + break; 294 + case htons(ETH_P_IPV6): 298 295 inner = ipv6_get_dsfield(ipv6_hdr(skb)); 299 - else 296 + break; 297 + default: 300 298 return 0; 299 + } 301 300 302 301 return INET_ECN_decapsulate(skb, ipv6_get_dsfield(oipv6h), inner); 303 302 }
-11
include/net/pkt_sched.h
··· 136 136 } 137 137 } 138 138 139 - static inline __be16 tc_skb_protocol(const struct sk_buff *skb) 140 - { 141 - /* We need to take extra care in case the skb came via 142 - * vlan accelerated path. In that case, use skb->vlan_proto 143 - * as the original vlan header was already stripped. 144 - */ 145 - if (skb_vlan_tag_present(skb)) 146 - return skb->vlan_proto; 147 - return skb->protocol; 148 - } 149 - 150 139 /* Calculate maximal size of packet seen by hard_start_xmit 151 140 routine of this device. 152 141 */
+7 -3
net/core/filter.c
··· 5853 5853 { 5854 5854 unsigned int iphdr_len; 5855 5855 5856 - if (skb->protocol == cpu_to_be16(ETH_P_IP)) 5856 + switch (skb_protocol(skb, true)) { 5857 + case cpu_to_be16(ETH_P_IP): 5857 5858 iphdr_len = sizeof(struct iphdr); 5858 - else if (skb->protocol == cpu_to_be16(ETH_P_IPV6)) 5859 + break; 5860 + case cpu_to_be16(ETH_P_IPV6): 5859 5861 iphdr_len = sizeof(struct ipv6hdr); 5860 - else 5862 + break; 5863 + default: 5861 5864 return 0; 5865 + } 5862 5866 5863 5867 if (skb_headlen(skb) < iphdr_len) 5864 5868 return 0;
+6 -3
net/sched/act_connmark.c
··· 43 43 tcf_lastuse_update(&ca->tcf_tm); 44 44 bstats_update(&ca->tcf_bstats, skb); 45 45 46 - if (skb->protocol == htons(ETH_P_IP)) { 46 + switch (skb_protocol(skb, true)) { 47 + case htons(ETH_P_IP): 47 48 if (skb->len < sizeof(struct iphdr)) 48 49 goto out; 49 50 50 51 proto = NFPROTO_IPV4; 51 - } else if (skb->protocol == htons(ETH_P_IPV6)) { 52 + break; 53 + case htons(ETH_P_IPV6): 52 54 if (skb->len < sizeof(struct ipv6hdr)) 53 55 goto out; 54 56 55 57 proto = NFPROTO_IPV6; 56 - } else { 58 + break; 59 + default: 57 60 goto out; 58 61 } 59 62
+1 -1
net/sched/act_csum.c
··· 587 587 goto drop; 588 588 589 589 update_flags = params->update_flags; 590 - protocol = tc_skb_protocol(skb); 590 + protocol = skb_protocol(skb, false); 591 591 again: 592 592 switch (protocol) { 593 593 case cpu_to_be16(ETH_P_IP):
+4 -5
net/sched/act_ct.c
··· 624 624 { 625 625 u8 family = NFPROTO_UNSPEC; 626 626 627 - switch (skb->protocol) { 627 + switch (skb_protocol(skb, true)) { 628 628 case htons(ETH_P_IP): 629 629 family = NFPROTO_IPV4; 630 630 break; ··· 748 748 const struct nf_nat_range2 *range, 749 749 enum nf_nat_manip_type maniptype) 750 750 { 751 + __be16 proto = skb_protocol(skb, true); 751 752 int hooknum, err = NF_ACCEPT; 752 753 753 754 /* See HOOK2MANIP(). */ ··· 760 759 switch (ctinfo) { 761 760 case IP_CT_RELATED: 762 761 case IP_CT_RELATED_REPLY: 763 - if (skb->protocol == htons(ETH_P_IP) && 762 + if (proto == htons(ETH_P_IP) && 764 763 ip_hdr(skb)->protocol == IPPROTO_ICMP) { 765 764 if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, 766 765 hooknum)) 767 766 err = NF_DROP; 768 767 goto out; 769 - } else if (IS_ENABLED(CONFIG_IPV6) && 770 - skb->protocol == htons(ETH_P_IPV6)) { 768 + } else if (IS_ENABLED(CONFIG_IPV6) && proto == htons(ETH_P_IPV6)) { 771 769 __be16 frag_off; 772 770 u8 nexthdr = ipv6_hdr(skb)->nexthdr; 773 771 int hdrlen = ipv6_skip_exthdr(skb, ··· 1550 1550 MODULE_AUTHOR("Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>"); 1551 1551 MODULE_DESCRIPTION("Connection tracking action"); 1552 1552 MODULE_LICENSE("GPL v2"); 1553 -
+6 -3
net/sched/act_ctinfo.c
··· 96 96 action = READ_ONCE(ca->tcf_action); 97 97 98 98 wlen = skb_network_offset(skb); 99 - if (tc_skb_protocol(skb) == htons(ETH_P_IP)) { 99 + switch (skb_protocol(skb, true)) { 100 + case htons(ETH_P_IP): 100 101 wlen += sizeof(struct iphdr); 101 102 if (!pskb_may_pull(skb, wlen)) 102 103 goto out; 103 104 104 105 proto = NFPROTO_IPV4; 105 - } else if (tc_skb_protocol(skb) == htons(ETH_P_IPV6)) { 106 + break; 107 + case htons(ETH_P_IPV6): 106 108 wlen += sizeof(struct ipv6hdr); 107 109 if (!pskb_may_pull(skb, wlen)) 108 110 goto out; 109 111 110 112 proto = NFPROTO_IPV6; 111 - } else { 113 + break; 114 + default: 112 115 goto out; 113 116 } 114 117
+1 -1
net/sched/act_mpls.c
··· 82 82 goto drop; 83 83 break; 84 84 case TCA_MPLS_ACT_PUSH: 85 - new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb->protocol)); 85 + new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb_protocol(skb, true))); 86 86 if (skb_mpls_push(skb, new_lse, p->tcfm_proto, mac_len, 87 87 skb->dev && skb->dev->type == ARPHRD_ETHER)) 88 88 goto drop;
+1 -1
net/sched/act_skbedit.c
··· 41 41 if (params->flags & SKBEDIT_F_INHERITDSFIELD) { 42 42 int wlen = skb_network_offset(skb); 43 43 44 - switch (tc_skb_protocol(skb)) { 44 + switch (skb_protocol(skb, true)) { 45 45 case htons(ETH_P_IP): 46 46 wlen += sizeof(struct iphdr); 47 47 if (!pskb_may_pull(skb, wlen))
+1 -1
net/sched/cls_api.c
··· 1538 1538 reclassify: 1539 1539 #endif 1540 1540 for (; tp; tp = rcu_dereference_bh(tp->next)) { 1541 - __be16 protocol = tc_skb_protocol(skb); 1541 + __be16 protocol = skb_protocol(skb, false); 1542 1542 int err; 1543 1543 1544 1544 if (tp->protocol != protocol &&
+4 -4
net/sched/cls_flow.c
··· 80 80 if (dst) 81 81 return ntohl(dst); 82 82 83 - return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); 83 + return addr_fold(skb_dst(skb)) ^ (__force u16)skb_protocol(skb, true); 84 84 } 85 85 86 86 static u32 flow_get_proto(const struct sk_buff *skb, ··· 104 104 if (flow->ports.ports) 105 105 return ntohs(flow->ports.dst); 106 106 107 - return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); 107 + return addr_fold(skb_dst(skb)) ^ (__force u16)skb_protocol(skb, true); 108 108 } 109 109 110 110 static u32 flow_get_iif(const struct sk_buff *skb) ··· 151 151 static u32 flow_get_nfct_src(const struct sk_buff *skb, 152 152 const struct flow_keys *flow) 153 153 { 154 - switch (tc_skb_protocol(skb)) { 154 + switch (skb_protocol(skb, true)) { 155 155 case htons(ETH_P_IP): 156 156 return ntohl(CTTUPLE(skb, src.u3.ip)); 157 157 case htons(ETH_P_IPV6): ··· 164 164 static u32 flow_get_nfct_dst(const struct sk_buff *skb, 165 165 const struct flow_keys *flow) 166 166 { 167 - switch (tc_skb_protocol(skb)) { 167 + switch (skb_protocol(skb, true)) { 168 168 case htons(ETH_P_IP): 169 169 return ntohl(CTTUPLE(skb, dst.u3.ip)); 170 170 case htons(ETH_P_IPV6):
+1 -1
net/sched/cls_flower.c
··· 313 313 /* skb_flow_dissect() does not set n_proto in case an unknown 314 314 * protocol, so do it rather here. 315 315 */ 316 - skb_key.basic.n_proto = skb->protocol; 316 + skb_key.basic.n_proto = skb_protocol(skb, false); 317 317 skb_flow_dissect_tunnel_info(skb, &mask->dissector, &skb_key); 318 318 skb_flow_dissect_ct(skb, &mask->dissector, &skb_key, 319 319 fl_ct_info_to_flower_map,
+1 -1
net/sched/em_ipset.c
··· 59 59 }; 60 60 int ret, network_offset; 61 61 62 - switch (tc_skb_protocol(skb)) { 62 + switch (skb_protocol(skb, true)) { 63 63 case htons(ETH_P_IP): 64 64 state.pf = NFPROTO_IPV4; 65 65 if (!pskb_network_may_pull(skb, sizeof(struct iphdr)))
+1 -1
net/sched/em_ipt.c
··· 212 212 struct nf_hook_state state; 213 213 int ret; 214 214 215 - switch (tc_skb_protocol(skb)) { 215 + switch (skb_protocol(skb, true)) { 216 216 case htons(ETH_P_IP): 217 217 if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) 218 218 return 0;
+1 -1
net/sched/em_meta.c
··· 195 195 META_COLLECTOR(int_protocol) 196 196 { 197 197 /* Let userspace take care of the byte ordering */ 198 - dst->value = tc_skb_protocol(skb); 198 + dst->value = skb_protocol(skb, false); 199 199 } 200 200 201 201 META_COLLECTOR(int_pkttype)
+2 -2
net/sched/sch_cake.c
··· 592 592 bool rev = !skb->_nfct, upd = false; 593 593 __be32 ip; 594 594 595 - if (tc_skb_protocol(skb) != htons(ETH_P_IP)) 595 + if (skb_protocol(skb, true) != htons(ETH_P_IP)) 596 596 return false; 597 597 598 598 if (!nf_ct_get_tuple_skb(&tuple, skb)) ··· 1557 1557 u16 *buf, buf_; 1558 1558 u8 dscp; 1559 1559 1560 - switch (tc_skb_protocol(skb)) { 1560 + switch (skb_protocol(skb, true)) { 1561 1561 case htons(ETH_P_IP): 1562 1562 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); 1563 1563 if (unlikely(!buf))
+3 -3
net/sched/sch_dsmark.c
··· 210 210 if (p->set_tc_index) { 211 211 int wlen = skb_network_offset(skb); 212 212 213 - switch (tc_skb_protocol(skb)) { 213 + switch (skb_protocol(skb, true)) { 214 214 case htons(ETH_P_IP): 215 215 wlen += sizeof(struct iphdr); 216 216 if (!pskb_may_pull(skb, wlen) || ··· 303 303 index = skb->tc_index & (p->indices - 1); 304 304 pr_debug("index %d->%d\n", skb->tc_index, index); 305 305 306 - switch (tc_skb_protocol(skb)) { 306 + switch (skb_protocol(skb, true)) { 307 307 case htons(ETH_P_IP): 308 308 ipv4_change_dsfield(ip_hdr(skb), p->mv[index].mask, 309 309 p->mv[index].value); ··· 320 320 */ 321 321 if (p->mv[index].mask != 0xff || p->mv[index].value) 322 322 pr_warn("%s: unsupported protocol %d\n", 323 - __func__, ntohs(tc_skb_protocol(skb))); 323 + __func__, ntohs(skb_protocol(skb, true))); 324 324 break; 325 325 } 326 326
+1 -1
net/sched/sch_teql.c
··· 239 239 char haddr[MAX_ADDR_LEN]; 240 240 241 241 neigh_ha_snapshot(haddr, n, dev); 242 - err = dev_hard_header(skb, dev, ntohs(tc_skb_protocol(skb)), 242 + err = dev_hard_header(skb, dev, ntohs(skb_protocol(skb, false)), 243 243 haddr, NULL, skb->len); 244 244 245 245 if (err < 0)