Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: add writable context for raw tracepoints

This is an opt-in interface that allows a tracepoint to provide a safe
buffer that can be written from a BPF_PROG_TYPE_RAW_TRACEPOINT program.
The size of the buffer must be a compile-time constant, and is checked
before allowing a BPF program to attach to a tracepoint that uses this
feature.

The pointer to this buffer will be the first argument of tracepoints
that opt in; the pointer is valid and can be read via bpf_probe_read()
by both BPF_PROG_TYPE_RAW_TRACEPOINT and
BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE programs that attach to such a
tracepoint, but the buffer to which it points may only be written by
the latter.

Signed-off-by: Matt Mullins <mmullins@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

authored by

Matt Mullins and committed by
Alexei Starovoitov
9df1c28b 34b8ab09

+91 -4
+2
include/linux/bpf.h
··· 272 272 PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ 273 273 PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ 274 274 PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ 275 + PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ 275 276 }; 276 277 277 278 /* The information passed from prog-specific *_is_valid_access ··· 362 361 u32 used_map_cnt; 363 362 u32 max_ctx_offset; 364 363 u32 max_pkt_offset; 364 + u32 max_tp_access; 365 365 u32 stack_depth; 366 366 u32 id; 367 367 u32 func_cnt; /* used by non-func prog as the number of func progs */
+1
include/linux/bpf_types.h
··· 25 25 BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) 26 26 BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) 27 27 BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) 28 + BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable) 28 29 #endif 29 30 #ifdef CONFIG_CGROUP_BPF 30 31 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
+1
include/linux/tracepoint-defs.h
··· 45 45 struct tracepoint *tp; 46 46 void *bpf_func; 47 47 u32 num_args; 48 + u32 writable_size; 48 49 } __aligned(32); 49 50 50 51 #endif
+25 -2
include/trace/bpf_probe.h
··· 69 69 * to make sure that if the tracepoint handling changes, the 70 70 * bpf probe will fail to compile unless it too is updated. 71 71 */ 72 - #undef DEFINE_EVENT 73 - #define DEFINE_EVENT(template, call, proto, args) \ 72 + #define __DEFINE_EVENT(template, call, proto, args, size) \ 74 73 static inline void bpf_test_probe_##call(void) \ 75 74 { \ 76 75 check_trace_callback_type_##call(__bpf_trace_##template); \ ··· 80 81 .tp = &__tracepoint_##call, \ 81 82 .bpf_func = (void *)__bpf_trace_##template, \ 82 83 .num_args = COUNT_ARGS(args), \ 84 + .writable_size = size, \ 83 85 }; 84 86 87 + #define FIRST(x, ...) x 88 + 89 + #undef DEFINE_EVENT_WRITABLE 90 + #define DEFINE_EVENT_WRITABLE(template, call, proto, args, size) \ 91 + static inline void bpf_test_buffer_##call(void) \ 92 + { \ 93 + /* BUILD_BUG_ON() is ignored if the code is completely eliminated, but \ 94 + * BUILD_BUG_ON_ZERO() uses a different mechanism that is not \ 95 + * dead-code-eliminated. \ 96 + */ \ 97 + FIRST(proto); \ 98 + (void)BUILD_BUG_ON_ZERO(size != sizeof(*FIRST(args))); \ 99 + } \ 100 + __DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), size) 101 + 102 + #undef DEFINE_EVENT 103 + #define DEFINE_EVENT(template, call, proto, args) \ 104 + __DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), 0) 85 105 86 106 #undef DEFINE_EVENT_PRINT 87 107 #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ 88 108 DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) 89 109 90 110 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) 111 + 112 + #undef DEFINE_EVENT_WRITABLE 113 + #undef __DEFINE_EVENT 114 + #undef FIRST 115 + 91 116 #endif /* CONFIG_BPF_EVENTS */
+1
include/uapi/linux/bpf.h
··· 168 168 BPF_PROG_TYPE_SK_REUSEPORT, 169 169 BPF_PROG_TYPE_FLOW_DISSECTOR, 170 170 BPF_PROG_TYPE_CGROUP_SYSCTL, 171 + BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, 171 172 }; 172 173 173 174 enum bpf_attach_type {
+6 -2
kernel/bpf/syscall.c
··· 1789 1789 } 1790 1790 raw_tp->btp = btp; 1791 1791 1792 - prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, 1793 - BPF_PROG_TYPE_RAW_TRACEPOINT); 1792 + prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); 1794 1793 if (IS_ERR(prog)) { 1795 1794 err = PTR_ERR(prog); 1796 1795 goto out_free_tp; 1796 + } 1797 + if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && 1798 + prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { 1799 + err = -EINVAL; 1800 + goto out_put_prog; 1797 1801 } 1798 1802 1799 1803 err = bpf_probe_register(raw_tp->btp, prog);
+31
kernel/bpf/verifier.c
··· 405 405 [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", 406 406 [PTR_TO_TCP_SOCK] = "tcp_sock", 407 407 [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", 408 + [PTR_TO_TP_BUFFER] = "tp_buffer", 408 409 }; 409 410 410 411 static char slot_type_char[] = { ··· 1994 1993 return 0; 1995 1994 } 1996 1995 1996 + static int check_tp_buffer_access(struct bpf_verifier_env *env, 1997 + const struct bpf_reg_state *reg, 1998 + int regno, int off, int size) 1999 + { 2000 + if (off < 0) { 2001 + verbose(env, 2002 + "R%d invalid tracepoint buffer access: off=%d, size=%d", 2003 + regno, off, size); 2004 + return -EACCES; 2005 + } 2006 + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { 2007 + char tn_buf[48]; 2008 + 2009 + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); 2010 + verbose(env, 2011 + "R%d invalid variable buffer offset: off=%d, var_off=%s", 2012 + regno, off, tn_buf); 2013 + return -EACCES; 2014 + } 2015 + if (off + size > env->prog->aux->max_tp_access) 2016 + env->prog->aux->max_tp_access = off + size; 2017 + 2018 + return 0; 2019 + } 2020 + 2021 + 1997 2022 /* truncate register to smaller size (in bytes) 1998 2023 * must be called with size < BPF_REG_SIZE 1999 2024 */ ··· 2163 2136 } 2164 2137 err = check_sock_access(env, insn_idx, regno, off, size, t); 2165 2138 if (!err && value_regno >= 0) 2139 + mark_reg_unknown(env, regs, value_regno); 2140 + } else if (reg->type == PTR_TO_TP_BUFFER) { 2141 + err = check_tp_buffer_access(env, reg, regno, off, size); 2142 + if (!err && t == BPF_READ && value_regno >= 0) 2166 2143 mark_reg_unknown(env, regs, value_regno); 2167 2144 } else { 2168 2145 verbose(env, "R%d invalid mem access '%s'\n", regno,
+24
kernel/trace/bpf_trace.c
··· 915 915 const struct bpf_prog_ops raw_tracepoint_prog_ops = { 916 916 }; 917 917 918 + static bool raw_tp_writable_prog_is_valid_access(int off, int size, 919 + enum bpf_access_type type, 920 + const struct bpf_prog *prog, 921 + struct bpf_insn_access_aux *info) 922 + { 923 + if (off == 0) { 924 + if (size != sizeof(u64) || type != BPF_READ) 925 + return false; 926 + info->reg_type = PTR_TO_TP_BUFFER; 927 + } 928 + return raw_tp_prog_is_valid_access(off, size, type, prog, info); 929 + } 930 + 931 + const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = { 932 + .get_func_proto = raw_tp_prog_func_proto, 933 + .is_valid_access = raw_tp_writable_prog_is_valid_access, 934 + }; 935 + 936 + const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = { 937 + }; 938 + 918 939 static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, 919 940 const struct bpf_prog *prog, 920 941 struct bpf_insn_access_aux *info) ··· 1223 1202 * available in this tracepoint 1224 1203 */ 1225 1204 if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64)) 1205 + return -EINVAL; 1206 + 1207 + if (prog->aux->max_tp_access > btp->writable_size) 1226 1208 return -EINVAL; 1227 1209 1228 1210 return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog);