Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: support SKF_NET_OFF and SKF_LL_OFF on skb frags

Classic BPF socket filters with SKF_NET_OFF and SKF_LL_OFF fail to
read when these offsets extend into frags.

This has been observed with iwlwifi and reproduced with tun with
IFF_NAPI_FRAGS. The below straightforward socket filter on UDP port,
applied to a RAW socket, will silently miss matching packets.

const int offset_proto = offsetof(struct ip6_hdr, ip6_nxt);
const int offset_dport = sizeof(struct ip6_hdr) + offsetof(struct udphdr, dest);
struct sock_filter filter_code[] = {
BPF_STMT(BPF_LD + BPF_B + BPF_ABS, SKF_AD_OFF + SKF_AD_PKTTYPE),
BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, PACKET_HOST, 0, 4),
BPF_STMT(BPF_LD + BPF_B + BPF_ABS, SKF_NET_OFF + offset_proto),
BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, IPPROTO_UDP, 0, 2),
BPF_STMT(BPF_LD + BPF_H + BPF_ABS, SKF_NET_OFF + offset_dport),

This is unexpected behavior. Socket filter programs should be
consistent regardless of environment. Silent misses are
particularly concerning, as they are hard to detect.

Use skb_copy_bits for offsets outside linear, same as done for
non-SKF_(LL|NET) offsets.

Offset is always positive after subtracting the reference threshold
SKF_(LL|NET)_OFF, so is always >= skb_(mac|network)_offset. The sum of
the two is an offset against skb->data, and may be negative, but it
cannot point before skb->head, as skb_(mac|network)_offset would too.

This appears to go back to when frag support was introduced to
sk_run_filter in linux-2.4.4, before the introduction of git.

The amount of code change and 8/16/32 bit duplication are unfortunate.
But any attempt I made to be smarter saved very few LoC while
complicating the code.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Link: https://lore.kernel.org/netdev/20250122200402.3461154-1-maze@google.com/
Link: https://elixir.bootlin.com/linux/2.4.4/source/net/core/filter.c#L244
Reported-by: Matt Moeller <moeller.matt@gmail.com>
Co-developed-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://lore.kernel.org/r/20250408132833.195491-2-willemdebruijn.kernel@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

authored by

Willem de Bruijn and committed by
Alexei Starovoitov
d4bac028 9bae8f4f

+44 -36
+44 -36
net/core/filter.c
··· 218 218 return 0; 219 219 } 220 220 221 + static int bpf_skb_load_helper_convert_offset(const struct sk_buff *skb, int offset) 222 + { 223 + if (likely(offset >= 0)) 224 + return offset; 225 + 226 + if (offset >= SKF_NET_OFF) 227 + return offset - SKF_NET_OFF + skb_network_offset(skb); 228 + 229 + if (offset >= SKF_LL_OFF && skb_mac_header_was_set(skb)) 230 + return offset - SKF_LL_OFF + skb_mac_offset(skb); 231 + 232 + return INT_MIN; 233 + } 234 + 221 235 BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *, 222 236 data, int, headlen, int, offset) 223 237 { 224 - u8 tmp, *ptr; 238 + u8 tmp; 225 239 const int len = sizeof(tmp); 226 240 227 - if (offset >= 0) { 228 - if (headlen - offset >= len) 229 - return *(u8 *)(data + offset); 230 - if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) 231 - return tmp; 232 - } else { 233 - ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); 234 - if (likely(ptr)) 235 - return *(u8 *)ptr; 236 - } 241 + offset = bpf_skb_load_helper_convert_offset(skb, offset); 242 + if (offset == INT_MIN) 243 + return -EFAULT; 237 244 238 - return -EFAULT; 245 + if (headlen - offset >= len) 246 + return *(u8 *)(data + offset); 247 + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) 248 + return tmp; 249 + else 250 + return -EFAULT; 239 251 } 240 252 241 253 BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, ··· 260 248 BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, 261 249 data, int, headlen, int, offset) 262 250 { 263 - __be16 tmp, *ptr; 251 + __be16 tmp; 264 252 const int len = sizeof(tmp); 265 253 266 - if (offset >= 0) { 267 - if (headlen - offset >= len) 268 - return get_unaligned_be16(data + offset); 269 - if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) 270 - return be16_to_cpu(tmp); 271 - } else { 272 - ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); 273 - if (likely(ptr)) 274 - return get_unaligned_be16(ptr); 275 - } 254 + offset 
= bpf_skb_load_helper_convert_offset(skb, offset); 255 + if (offset == INT_MIN) 256 + return -EFAULT; 276 257 277 - return -EFAULT; 258 + if (headlen - offset >= len) 259 + return get_unaligned_be16(data + offset); 260 + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) 261 + return be16_to_cpu(tmp); 262 + else 263 + return -EFAULT; 278 264 } 279 265 280 266 BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, ··· 285 275 BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, 286 276 data, int, headlen, int, offset) 287 277 { 288 - __be32 tmp, *ptr; 278 + __be32 tmp; 289 279 const int len = sizeof(tmp); 290 280 291 - if (likely(offset >= 0)) { 292 - if (headlen - offset >= len) 293 - return get_unaligned_be32(data + offset); 294 - if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) 295 - return be32_to_cpu(tmp); 296 - } else { 297 - ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); 298 - if (likely(ptr)) 299 - return get_unaligned_be32(ptr); 300 - } 281 + offset = bpf_skb_load_helper_convert_offset(skb, offset); 282 + if (offset == INT_MIN) 283 + return -EFAULT; 301 284 302 - return -EFAULT; 285 + if (headlen - offset >= len) 286 + return get_unaligned_be32(data + offset); 287 + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) 288 + return be32_to_cpu(tmp); 289 + else 290 + return -EFAULT; 303 291 } 304 292 305 293 BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,