Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

flow_dissector: switch kernel context to struct bpf_flow_dissector

struct bpf_flow_dissector has a small subset of sk_buff fields that the
flow dissector BPF program is allowed to access, and an optional
pointer to the real skb. The real skb is used only by the
bpf_skb_load_bytes helper to read non-linear data.

The real motivation for this is to be able to call the flow dissector
from the eth_get_headlen context, where we don't have an skb and need
to dissect raw bytes.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>

authored by

Stanislav Fomichev and committed by
Daniel Borkmann
089b19a9 7e6e185c

+117 -59
+4
include/linux/skbuff.h
··· 1275 1275 } 1276 1276 #endif 1277 1277 1278 + struct bpf_flow_dissector; 1279 + bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, 1280 + __be16 proto, int nhoff, int hlen); 1281 + 1278 1282 struct bpf_flow_keys; 1279 1283 bool __skb_flow_bpf_dissect(struct bpf_prog *prog, 1280 1284 const struct sk_buff *skb,
+7
include/net/flow_dissector.h
··· 305 305 return ((char *)target_container) + flow_dissector->offset[key_id]; 306 306 } 307 307 308 + struct bpf_flow_dissector { 309 + struct bpf_flow_keys *flow_keys; 310 + const struct sk_buff *skb; 311 + void *data; 312 + void *data_end; 313 + }; 314 + 308 315 #endif
+4 -7
include/net/sch_generic.h
··· 364 364 }; 365 365 366 366 struct qdisc_skb_cb { 367 - union { 368 - struct { 369 - unsigned int pkt_len; 370 - u16 slave_dev_queue_mapping; 371 - u16 tc_classid; 372 - }; 373 - struct bpf_flow_keys *flow_keys; 367 + struct { 368 + unsigned int pkt_len; 369 + u16 slave_dev_queue_mapping; 370 + u16 tc_classid; 374 371 }; 375 372 #define QDISC_CB_PRIV_LEN 20 376 373 unsigned char data[QDISC_CB_PRIV_LEN];
-4
net/bpf/test_run.c
··· 382 382 u32 repeat = kattr->test.repeat; 383 383 struct bpf_flow_keys flow_keys; 384 384 u64 time_start, time_spent = 0; 385 - struct bpf_skb_data_end *cb; 386 385 u32 retval, duration; 387 386 struct sk_buff *skb; 388 387 struct sock *sk; ··· 421 422 skb->protocol = eth_type_trans(skb, 422 423 current->nsproxy->net_ns->loopback_dev); 423 424 skb_reset_network_header(skb); 424 - 425 - cb = (struct bpf_skb_data_end *)skb->cb; 426 - cb->qdisc_cb.flow_keys = &flow_keys; 427 425 428 426 if (!repeat) 429 427 repeat = 1;
+82 -23
net/core/filter.c
··· 1730 1730 .arg4_type = ARG_CONST_SIZE, 1731 1731 }; 1732 1732 1733 + BPF_CALL_4(bpf_flow_dissector_load_bytes, 1734 + const struct bpf_flow_dissector *, ctx, u32, offset, 1735 + void *, to, u32, len) 1736 + { 1737 + void *ptr; 1738 + 1739 + if (unlikely(offset > 0xffff)) 1740 + goto err_clear; 1741 + 1742 + if (unlikely(!ctx->skb)) 1743 + goto err_clear; 1744 + 1745 + ptr = skb_header_pointer(ctx->skb, offset, len, to); 1746 + if (unlikely(!ptr)) 1747 + goto err_clear; 1748 + if (ptr != to) 1749 + memcpy(to, ptr, len); 1750 + 1751 + return 0; 1752 + err_clear: 1753 + memset(to, 0, len); 1754 + return -EFAULT; 1755 + } 1756 + 1757 + static const struct bpf_func_proto bpf_flow_dissector_load_bytes_proto = { 1758 + .func = bpf_flow_dissector_load_bytes, 1759 + .gpl_only = false, 1760 + .ret_type = RET_INTEGER, 1761 + .arg1_type = ARG_PTR_TO_CTX, 1762 + .arg2_type = ARG_ANYTHING, 1763 + .arg3_type = ARG_PTR_TO_UNINIT_MEM, 1764 + .arg4_type = ARG_CONST_SIZE, 1765 + }; 1766 + 1733 1767 BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, 1734 1768 u32, offset, void *, to, u32, len, u32, start_header) 1735 1769 { ··· 6155 6121 { 6156 6122 switch (func_id) { 6157 6123 case BPF_FUNC_skb_load_bytes: 6158 - return &bpf_skb_load_bytes_proto; 6124 + return &bpf_flow_dissector_load_bytes_proto; 6159 6125 default: 6160 6126 return bpf_base_func_proto(func_id); 6161 6127 } ··· 6282 6248 return false; 6283 6249 break; 6284 6250 case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): 6285 - if (size != sizeof(__u64)) 6286 - return false; 6287 - break; 6251 + return false; 6288 6252 case bpf_ctx_range(struct __sk_buff, tstamp): 6289 6253 if (size != sizeof(__u64)) 6290 6254 return false; ··· 6317 6285 case bpf_ctx_range(struct __sk_buff, data): 6318 6286 case bpf_ctx_range(struct __sk_buff, data_meta): 6319 6287 case bpf_ctx_range(struct __sk_buff, data_end): 6320 - case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): 6321 6288 case bpf_ctx_range_till(struct 
__sk_buff, family, local_port): 6322 6289 case bpf_ctx_range(struct __sk_buff, tstamp): 6323 6290 case bpf_ctx_range(struct __sk_buff, wire_len): ··· 6343 6312 switch (off) { 6344 6313 case bpf_ctx_range(struct __sk_buff, tc_classid): 6345 6314 case bpf_ctx_range(struct __sk_buff, data_meta): 6346 - case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): 6347 6315 case bpf_ctx_range(struct __sk_buff, wire_len): 6348 6316 return false; 6349 6317 case bpf_ctx_range(struct __sk_buff, data): ··· 6388 6358 case bpf_ctx_range(struct __sk_buff, tc_classid): 6389 6359 case bpf_ctx_range_till(struct __sk_buff, family, local_port): 6390 6360 case bpf_ctx_range(struct __sk_buff, data_meta): 6391 - case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): 6392 6361 case bpf_ctx_range(struct __sk_buff, tstamp): 6393 6362 case bpf_ctx_range(struct __sk_buff, wire_len): 6394 6363 return false; ··· 6630 6601 case bpf_ctx_range(struct __sk_buff, data_end): 6631 6602 info->reg_type = PTR_TO_PACKET_END; 6632 6603 break; 6633 - case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): 6634 6604 case bpf_ctx_range_till(struct __sk_buff, family, local_port): 6635 6605 return false; 6636 6606 } ··· 6831 6803 switch (off) { 6832 6804 case bpf_ctx_range(struct __sk_buff, tc_classid): 6833 6805 case bpf_ctx_range(struct __sk_buff, data_meta): 6834 - case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): 6835 6806 case bpf_ctx_range(struct __sk_buff, tstamp): 6836 6807 case bpf_ctx_range(struct __sk_buff, wire_len): 6837 6808 return false; ··· 6904 6877 const struct bpf_prog *prog, 6905 6878 struct bpf_insn_access_aux *info) 6906 6879 { 6880 + const int size_default = sizeof(__u32); 6881 + 6882 + if (off < 0 || off >= sizeof(struct __sk_buff)) 6883 + return false; 6884 + 6907 6885 if (type == BPF_WRITE) 6908 6886 return false; 6909 6887 6910 6888 switch (off) { 6911 6889 case bpf_ctx_range(struct __sk_buff, data): 6890 + if (size != size_default) 6891 + return false; 6912 6892 info->reg_type = 
PTR_TO_PACKET; 6913 - break; 6893 + return true; 6914 6894 case bpf_ctx_range(struct __sk_buff, data_end): 6895 + if (size != size_default) 6896 + return false; 6915 6897 info->reg_type = PTR_TO_PACKET_END; 6916 - break; 6898 + return true; 6917 6899 case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): 6900 + if (size != sizeof(__u64)) 6901 + return false; 6918 6902 info->reg_type = PTR_TO_FLOW_KEYS; 6919 - break; 6903 + return true; 6920 6904 default: 6921 6905 return false; 6922 6906 } 6907 + } 6923 6908 6924 - return bpf_skb_is_valid_access(off, size, type, prog, info); 6909 + static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, 6910 + const struct bpf_insn *si, 6911 + struct bpf_insn *insn_buf, 6912 + struct bpf_prog *prog, 6913 + u32 *target_size) 6914 + 6915 + { 6916 + struct bpf_insn *insn = insn_buf; 6917 + 6918 + switch (si->off) { 6919 + case offsetof(struct __sk_buff, data): 6920 + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data), 6921 + si->dst_reg, si->src_reg, 6922 + offsetof(struct bpf_flow_dissector, data)); 6923 + break; 6924 + 6925 + case offsetof(struct __sk_buff, data_end): 6926 + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data_end), 6927 + si->dst_reg, si->src_reg, 6928 + offsetof(struct bpf_flow_dissector, data_end)); 6929 + break; 6930 + 6931 + case offsetof(struct __sk_buff, flow_keys): 6932 + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, flow_keys), 6933 + si->dst_reg, si->src_reg, 6934 + offsetof(struct bpf_flow_dissector, flow_keys)); 6935 + break; 6936 + } 6937 + 6938 + return insn - insn_buf; 6925 6939 } 6926 6940 6927 6941 static u32 bpf_convert_ctx_access(enum bpf_access_type type, ··· 7267 7199 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, 7268 7200 bpf_target_off(struct sock_common, 7269 7201 skc_num, 2, target_size)); 7270 - break; 7271 - 7272 - case offsetof(struct __sk_buff, flow_keys): 7273 - off = si->off; 7274 - off -= offsetof(struct 
__sk_buff, flow_keys); 7275 - off += offsetof(struct sk_buff, cb); 7276 - off += offsetof(struct qdisc_skb_cb, flow_keys); 7277 - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, 7278 - si->src_reg, off); 7279 7202 break; 7280 7203 7281 7204 case offsetof(struct __sk_buff, tstamp): ··· 8273 8214 const struct bpf_verifier_ops flow_dissector_verifier_ops = { 8274 8215 .get_func_proto = flow_dissector_func_proto, 8275 8216 .is_valid_access = flow_dissector_is_valid_access, 8276 - .convert_ctx_access = bpf_convert_ctx_access, 8217 + .convert_ctx_access = flow_dissector_convert_ctx_access, 8277 8218 }; 8278 8219 8279 8220 const struct bpf_prog_ops flow_dissector_prog_ops = {
+20 -25
net/core/flow_dissector.c
··· 688 688 struct flow_dissector *flow_dissector, 689 689 struct bpf_flow_keys *flow_keys) 690 690 { 691 - struct bpf_skb_data_end cb_saved; 692 - struct bpf_skb_data_end *cb; 691 + struct bpf_flow_dissector ctx = { 692 + .flow_keys = flow_keys, 693 + .skb = skb, 694 + .data = skb->data, 695 + .data_end = skb->data + skb_headlen(skb), 696 + }; 697 + 698 + return bpf_flow_dissect(prog, &ctx, skb->protocol, 699 + skb_network_offset(skb), skb_headlen(skb)); 700 + } 701 + 702 + bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, 703 + __be16 proto, int nhoff, int hlen) 704 + { 705 + struct bpf_flow_keys *flow_keys = ctx->flow_keys; 693 706 u32 result; 694 - 695 - /* Note that even though the const qualifier is discarded 696 - * throughout the execution of the BPF program, all changes(the 697 - * control block) are reverted after the BPF program returns. 698 - * Therefore, __skb_flow_dissect does not alter the skb. 699 - */ 700 - 701 - cb = (struct bpf_skb_data_end *)skb->cb; 702 - 703 - /* Save Control Block */ 704 - memcpy(&cb_saved, cb, sizeof(cb_saved)); 705 - memset(cb, 0, sizeof(*cb)); 706 707 707 708 /* Pass parameters to the BPF program */ 708 709 memset(flow_keys, 0, sizeof(*flow_keys)); 709 - cb->qdisc_cb.flow_keys = flow_keys; 710 - flow_keys->n_proto = skb->protocol; 711 - flow_keys->nhoff = skb_network_offset(skb); 710 + flow_keys->n_proto = proto; 711 + flow_keys->nhoff = nhoff; 712 712 flow_keys->thoff = flow_keys->nhoff; 713 713 714 - bpf_compute_data_pointers((struct sk_buff *)skb); 715 - result = BPF_PROG_RUN(prog, skb); 714 + result = BPF_PROG_RUN(prog, ctx); 716 715 717 - /* Restore state */ 718 - memcpy(cb, &cb_saved, sizeof(cb_saved)); 719 - 720 - flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, 721 - skb_network_offset(skb), skb->len); 716 + flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen); 722 717 flow_keys->thoff = clamp_t(u16, flow_keys->thoff, 723 - flow_keys->nhoff, skb->len); 718 + flow_keys->nhoff, 
hlen); 724 719 725 720 return result == BPF_OK; 726 721 }