Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: fix bpf helpers to use skb->mac_header relative offsets

For the short-term solution, let's fix the bpf helper functions to use
skb->mac_header relative offsets instead of skb->data, so that the
same eBPF programs work with cls_bpf and act_bpf on both the ingress
and egress qdisc paths. We need to ensure that mac_header is set
before calling into programs. This is effectively the first option
from the discussion referenced below.

A more long-term solution for the LD_ABS|LD_IND instructions will be
more intrusive but also more beneficial than this one; it will be
implemented later, as it's too risky at this point in time.

I.e., we plan to look into the option of moving skb_pull() out of
eth_type_trans() and into netif_receive_skb() as has been suggested
as second option. Meanwhile, this solution ensures ingress can be
used with eBPF, too, and that we won't run into ABI troubles later.
For dealing with negative offsets inside eBPF helper functions,
we've implemented bpf_skb_clone_unwritable() to test for unwritable
headers.

Reference: http://thread.gmane.org/gmane.linux.network/359129/focus=359694
Fixes: 608cd71a9c7c ("tc: bpf: generalize pedit action")
Fixes: 91bc4822c3d6 ("tc: bpf: add checksum helpers")
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Alexei Starovoitov and committed by
David S. Miller
a166151c 51b5df88

+50 -22
+1 -1
include/uapi/linux/bpf.h
··· 177 177 /** 178 178 * skb_store_bytes(skb, offset, from, len, flags) - store bytes into packet 179 179 * @skb: pointer to skb 180 - * @offset: offset within packet from skb->data 180 + * @offset: offset within packet from skb->mac_header 181 181 * @from: pointer where to copy bytes from 182 182 * @len: number of bytes to store into packet 183 183 * @flags: bit 0 - if true, recompute skb->csum
+5 -2
include/uapi/linux/filter.h
··· 79 79 #define SKF_AD_RANDOM 56 80 80 #define SKF_AD_VLAN_TPID 60 81 81 #define SKF_AD_MAX 64 82 - #define SKF_NET_OFF (-0x100000) 83 - #define SKF_LL_OFF (-0x200000) 84 82 83 + #define SKF_NET_OFF (-0x100000) 84 + #define SKF_LL_OFF (-0x200000) 85 + 86 + #define BPF_NET_OFF SKF_NET_OFF 87 + #define BPF_LL_OFF SKF_LL_OFF 85 88 86 89 #endif /* _UAPI__LINUX_FILTER_H__ */
+32 -9
net/core/filter.c
··· 1175 1175 return 0; 1176 1176 } 1177 1177 1178 + /** 1179 + * bpf_skb_clone_not_writable - is the header of a clone not writable 1180 + * @skb: buffer to check 1181 + * @len: length up to which to write, can be negative 1182 + * 1183 + * Returns true if modifying the header part of the cloned buffer 1184 + * does require the data to be copied. I.e. this version works with 1185 + * negative lengths needed for eBPF case! 1186 + */ 1187 + static bool bpf_skb_clone_unwritable(const struct sk_buff *skb, int len) 1188 + { 1189 + return skb_header_cloned(skb) || 1190 + (int) skb_headroom(skb) + len > skb->hdr_len; 1191 + } 1192 + 1178 1193 #define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1) 1179 1194 1180 1195 static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) 1181 1196 { 1182 1197 struct sk_buff *skb = (struct sk_buff *) (long) r1; 1183 - unsigned int offset = (unsigned int) r2; 1198 + int offset = (int) r2; 1184 1199 void *from = (void *) (long) r3; 1185 1200 unsigned int len = (unsigned int) r4; 1186 1201 char buf[16]; ··· 1209 1194 * 1210 1195 * so check for invalid 'offset' and too large 'len' 1211 1196 */ 1212 - if (unlikely(offset > 0xffff || len > sizeof(buf))) 1197 + if (unlikely((u32) offset > 0xffff || len > sizeof(buf))) 1213 1198 return -EFAULT; 1214 1199 1215 - if (skb_cloned(skb) && !skb_clone_writable(skb, offset + len)) 1200 + offset -= skb->data - skb_mac_header(skb); 1201 + if (unlikely(skb_cloned(skb) && 1202 + bpf_skb_clone_unwritable(skb, offset + len))) 1216 1203 return -EFAULT; 1217 1204 1218 1205 ptr = skb_header_pointer(skb, offset, len, buf); ··· 1249 1232 #define BPF_HEADER_FIELD_SIZE(flags) ((flags) & 0x0f) 1250 1233 #define BPF_IS_PSEUDO_HEADER(flags) ((flags) & 0x10) 1251 1234 1252 - static u64 bpf_l3_csum_replace(u64 r1, u64 offset, u64 from, u64 to, u64 flags) 1235 + static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) 1253 1236 { 1254 1237 struct sk_buff *skb = (struct sk_buff *) (long) r1; 
1238 + int offset = (int) r2; 1255 1239 __sum16 sum, *ptr; 1256 1240 1257 - if (unlikely(offset > 0xffff)) 1241 + if (unlikely((u32) offset > 0xffff)) 1258 1242 return -EFAULT; 1259 1243 1260 - if (skb_cloned(skb) && !skb_clone_writable(skb, offset + sizeof(sum))) 1244 + offset -= skb->data - skb_mac_header(skb); 1245 + if (unlikely(skb_cloned(skb) && 1246 + bpf_skb_clone_unwritable(skb, offset + sizeof(sum)))) 1261 1247 return -EFAULT; 1262 1248 1263 1249 ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); ··· 1296 1276 .arg5_type = ARG_ANYTHING, 1297 1277 }; 1298 1278 1299 - static u64 bpf_l4_csum_replace(u64 r1, u64 offset, u64 from, u64 to, u64 flags) 1279 + static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) 1300 1280 { 1301 1281 struct sk_buff *skb = (struct sk_buff *) (long) r1; 1302 1282 u32 is_pseudo = BPF_IS_PSEUDO_HEADER(flags); 1283 + int offset = (int) r2; 1303 1284 __sum16 sum, *ptr; 1304 1285 1305 - if (unlikely(offset > 0xffff)) 1286 + if (unlikely((u32) offset > 0xffff)) 1306 1287 return -EFAULT; 1307 1288 1308 - if (skb_cloned(skb) && !skb_clone_writable(skb, offset + sizeof(sum))) 1289 + offset -= skb->data - skb_mac_header(skb); 1290 + if (unlikely(skb_cloned(skb) && 1291 + bpf_skb_clone_unwritable(skb, offset + sizeof(sum)))) 1309 1292 return -EFAULT; 1310 1293 1311 1294 ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
+3
net/sched/act_bpf.c
··· 38 38 struct tcf_bpf *prog = act->priv; 39 39 int action, filter_res; 40 40 41 + if (unlikely(!skb_mac_header_was_set(skb))) 42 + return TC_ACT_UNSPEC; 43 + 41 44 spin_lock(&prog->tcf_lock); 42 45 43 46 prog->tcf_tm.lastuse = jiffies;
+3
net/sched/cls_bpf.c
··· 66 66 struct cls_bpf_prog *prog; 67 67 int ret = -1; 68 68 69 + if (unlikely(!skb_mac_header_was_set(skb))) 70 + return -1; 71 + 69 72 /* Needed here for accessing maps. */ 70 73 rcu_read_lock(); 71 74 list_for_each_entry_rcu(prog, &head->plist, link) {
+6 -10
samples/bpf/tcbpf1_kern.c
··· 4 4 #include <uapi/linux/ip.h> 5 5 #include <uapi/linux/in.h> 6 6 #include <uapi/linux/tcp.h> 7 + #include <uapi/linux/filter.h> 8 + 7 9 #include "bpf_helpers.h" 8 10 9 11 /* compiler workaround */ ··· 16 14 bpf_skb_store_bytes(skb, 0, mac, ETH_ALEN, 1); 17 15 } 18 16 19 - /* use 1 below for ingress qdisc and 0 for egress */ 20 - #if 0 21 - #undef ETH_HLEN 22 - #define ETH_HLEN 0 23 - #endif 24 - 25 17 #define IP_CSUM_OFF (ETH_HLEN + offsetof(struct iphdr, check)) 26 18 #define TOS_OFF (ETH_HLEN + offsetof(struct iphdr, tos)) 27 19 28 20 static inline void set_ip_tos(struct __sk_buff *skb, __u8 new_tos) 29 21 { 30 - __u8 old_tos = load_byte(skb, TOS_OFF); 22 + __u8 old_tos = load_byte(skb, BPF_LL_OFF + TOS_OFF); 31 23 32 24 bpf_l3_csum_replace(skb, IP_CSUM_OFF, htons(old_tos), htons(new_tos), 2); 33 25 bpf_skb_store_bytes(skb, TOS_OFF, &new_tos, sizeof(new_tos), 0); ··· 34 38 35 39 static inline void set_tcp_ip_src(struct __sk_buff *skb, __u32 new_ip) 36 40 { 37 - __u32 old_ip = _htonl(load_word(skb, IP_SRC_OFF)); 41 + __u32 old_ip = _htonl(load_word(skb, BPF_LL_OFF + IP_SRC_OFF)); 38 42 39 43 bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_ip, new_ip, IS_PSEUDO | sizeof(new_ip)); 40 44 bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip)); ··· 44 48 #define TCP_DPORT_OFF (ETH_HLEN + sizeof(struct iphdr) + offsetof(struct tcphdr, dest)) 45 49 static inline void set_tcp_dest_port(struct __sk_buff *skb, __u16 new_port) 46 50 { 47 - __u16 old_port = htons(load_half(skb, TCP_DPORT_OFF)); 51 + __u16 old_port = htons(load_half(skb, BPF_LL_OFF + TCP_DPORT_OFF)); 48 52 49 53 bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_port, new_port, sizeof(new_port)); 50 54 bpf_skb_store_bytes(skb, TCP_DPORT_OFF, &new_port, sizeof(new_port), 0); ··· 53 57 SEC("classifier") 54 58 int bpf_prog1(struct __sk_buff *skb) 55 59 { 56 - __u8 proto = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); 60 + __u8 proto = load_byte(skb, BPF_LL_OFF + ETH_HLEN + offsetof(struct 
iphdr, protocol)); 57 61 long *value; 58 62 59 63 if (proto == IPPROTO_TCP) {