Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: introduce bpf subcommand BPF_TASK_FD_QUERY

Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.

There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
to this approach. First, a bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.

This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, if the <pid, fd> is associated with a
tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
. prog_id
. tracepoint name, or
. k[ret]probe funcname + offset or kernel addr, or
. u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.

Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

authored by

Yonghong Song and committed by
Alexei Starovoitov
41bdc4b4 f8d959a5

+273
+17
include/linux/trace_events.h
··· 473 473 int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); 474 474 int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); 475 475 struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name); 476 + int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, 477 + u32 *fd_type, const char **buf, 478 + u64 *probe_offset, u64 *probe_addr); 476 479 #else 477 480 static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) 478 481 { ··· 506 503 static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) 507 504 { 508 505 return NULL; 506 + } 507 + static inline int bpf_get_perf_event_info(const struct perf_event *event, 508 + u32 *prog_id, u32 *fd_type, 509 + const char **buf, u64 *probe_offset, 510 + u64 *probe_addr) 511 + { 512 + return -EOPNOTSUPP; 509 513 } 510 514 #endif 511 515 ··· 570 560 #ifdef CONFIG_KPROBE_EVENTS 571 561 extern int perf_kprobe_init(struct perf_event *event, bool is_retprobe); 572 562 extern void perf_kprobe_destroy(struct perf_event *event); 563 + extern int bpf_get_kprobe_info(const struct perf_event *event, 564 + u32 *fd_type, const char **symbol, 565 + u64 *probe_offset, u64 *probe_addr, 566 + bool perf_type_tracepoint); 573 567 #endif 574 568 #ifdef CONFIG_UPROBE_EVENTS 575 569 extern int perf_uprobe_init(struct perf_event *event, bool is_retprobe); 576 570 extern void perf_uprobe_destroy(struct perf_event *event); 571 + extern int bpf_get_uprobe_info(const struct perf_event *event, 572 + u32 *fd_type, const char **filename, 573 + u64 *probe_offset, bool perf_type_tracepoint); 577 574 #endif 578 575 extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, 579 576 char *filter_str);
+26
include/uapi/linux/bpf.h
··· 97 97 BPF_RAW_TRACEPOINT_OPEN, 98 98 BPF_BTF_LOAD, 99 99 BPF_BTF_GET_FD_BY_ID, 100 + BPF_TASK_FD_QUERY, 100 101 }; 101 102 102 103 enum bpf_map_type { ··· 381 380 __u32 btf_log_size; 382 381 __u32 btf_log_level; 383 382 }; 383 + 384 + struct { 385 + __u32 pid; /* input: pid */ 386 + __u32 fd; /* input: fd */ 387 + __u32 flags; /* input: flags */ 388 + __u32 buf_len; /* input/output: buf len */ 389 + __aligned_u64 buf; /* input/output: 390 + * tp_name for tracepoint 391 + * symbol for kprobe 392 + * filename for uprobe 393 + */ 394 + __u32 prog_id; /* output: prod_id */ 395 + __u32 fd_type; /* output: BPF_FD_TYPE_* */ 396 + __u64 probe_offset; /* output: probe_offset */ 397 + __u64 probe_addr; /* output: probe_addr */ 398 + } task_fd_query; 384 399 } __attribute__((aligned(8))); 385 400 386 401 /* The description below is an attempt at providing documentation to eBPF ··· 2572 2555 __be16 h_vlan_TCI; 2573 2556 __u8 smac[6]; /* ETH_ALEN */ 2574 2557 __u8 dmac[6]; /* ETH_ALEN */ 2558 + }; 2559 + 2560 + enum bpf_task_fd_type { 2561 + BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ 2562 + BPF_FD_TYPE_TRACEPOINT, /* tp name */ 2563 + BPF_FD_TYPE_KPROBE, /* (symbol + offset) or addr */ 2564 + BPF_FD_TYPE_KRETPROBE, /* (symbol + offset) or addr */ 2565 + BPF_FD_TYPE_UPROBE, /* filename + offset */ 2566 + BPF_FD_TYPE_URETPROBE, /* filename + offset */ 2575 2567 }; 2576 2568 2577 2569 #endif /* _UAPI__LINUX_BPF_H__ */
+131
kernel/bpf/syscall.c
··· 18 18 #include <linux/vmalloc.h> 19 19 #include <linux/mmzone.h> 20 20 #include <linux/anon_inodes.h> 21 + #include <linux/fdtable.h> 21 22 #include <linux/file.h> 23 + #include <linux/fs.h> 22 24 #include <linux/license.h> 23 25 #include <linux/filter.h> 24 26 #include <linux/version.h> ··· 2180 2178 return btf_get_fd_by_id(attr->btf_id); 2181 2179 } 2182 2180 2181 + static int bpf_task_fd_query_copy(const union bpf_attr *attr, 2182 + union bpf_attr __user *uattr, 2183 + u32 prog_id, u32 fd_type, 2184 + const char *buf, u64 probe_offset, 2185 + u64 probe_addr) 2186 + { 2187 + char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf); 2188 + u32 len = buf ? strlen(buf) : 0, input_len; 2189 + int err = 0; 2190 + 2191 + if (put_user(len, &uattr->task_fd_query.buf_len)) 2192 + return -EFAULT; 2193 + input_len = attr->task_fd_query.buf_len; 2194 + if (input_len && ubuf) { 2195 + if (!len) { 2196 + /* nothing to copy, just make ubuf NULL terminated */ 2197 + char zero = '\0'; 2198 + 2199 + if (put_user(zero, ubuf)) 2200 + return -EFAULT; 2201 + } else if (input_len >= len + 1) { 2202 + /* ubuf can hold the string with NULL terminator */ 2203 + if (copy_to_user(ubuf, buf, len + 1)) 2204 + return -EFAULT; 2205 + } else { 2206 + /* ubuf cannot hold the string with NULL terminator, 2207 + * do a partial copy with NULL terminator. 
2208 + */ 2209 + char zero = '\0'; 2210 + 2211 + err = -ENOSPC; 2212 + if (copy_to_user(ubuf, buf, input_len - 1)) 2213 + return -EFAULT; 2214 + if (put_user(zero, ubuf + input_len - 1)) 2215 + return -EFAULT; 2216 + } 2217 + } 2218 + 2219 + if (put_user(prog_id, &uattr->task_fd_query.prog_id) || 2220 + put_user(fd_type, &uattr->task_fd_query.fd_type) || 2221 + put_user(probe_offset, &uattr->task_fd_query.probe_offset) || 2222 + put_user(probe_addr, &uattr->task_fd_query.probe_addr)) 2223 + return -EFAULT; 2224 + 2225 + return err; 2226 + } 2227 + 2228 + #define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr 2229 + 2230 + static int bpf_task_fd_query(const union bpf_attr *attr, 2231 + union bpf_attr __user *uattr) 2232 + { 2233 + pid_t pid = attr->task_fd_query.pid; 2234 + u32 fd = attr->task_fd_query.fd; 2235 + const struct perf_event *event; 2236 + struct files_struct *files; 2237 + struct task_struct *task; 2238 + struct file *file; 2239 + int err; 2240 + 2241 + if (CHECK_ATTR(BPF_TASK_FD_QUERY)) 2242 + return -EINVAL; 2243 + 2244 + if (!capable(CAP_SYS_ADMIN)) 2245 + return -EPERM; 2246 + 2247 + if (attr->task_fd_query.flags != 0) 2248 + return -EINVAL; 2249 + 2250 + task = get_pid_task(find_vpid(pid), PIDTYPE_PID); 2251 + if (!task) 2252 + return -ENOENT; 2253 + 2254 + files = get_files_struct(task); 2255 + put_task_struct(task); 2256 + if (!files) 2257 + return -ENOENT; 2258 + 2259 + err = 0; 2260 + spin_lock(&files->file_lock); 2261 + file = fcheck_files(files, fd); 2262 + if (!file) 2263 + err = -EBADF; 2264 + else 2265 + get_file(file); 2266 + spin_unlock(&files->file_lock); 2267 + put_files_struct(files); 2268 + 2269 + if (err) 2270 + goto out; 2271 + 2272 + if (file->f_op == &bpf_raw_tp_fops) { 2273 + struct bpf_raw_tracepoint *raw_tp = file->private_data; 2274 + struct bpf_raw_event_map *btp = raw_tp->btp; 2275 + 2276 + err = bpf_task_fd_query_copy(attr, uattr, 2277 + raw_tp->prog->aux->id, 2278 + BPF_FD_TYPE_RAW_TRACEPOINT, 2279 + btp->tp->name, 
0, 0); 2280 + goto put_file; 2281 + } 2282 + 2283 + event = perf_get_event(file); 2284 + if (!IS_ERR(event)) { 2285 + u64 probe_offset, probe_addr; 2286 + u32 prog_id, fd_type; 2287 + const char *buf; 2288 + 2289 + err = bpf_get_perf_event_info(event, &prog_id, &fd_type, 2290 + &buf, &probe_offset, 2291 + &probe_addr); 2292 + if (!err) 2293 + err = bpf_task_fd_query_copy(attr, uattr, prog_id, 2294 + fd_type, buf, 2295 + probe_offset, 2296 + probe_addr); 2297 + goto put_file; 2298 + } 2299 + 2300 + err = -ENOTSUPP; 2301 + put_file: 2302 + fput(file); 2303 + out: 2304 + return err; 2305 + } 2306 + 2183 2307 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) 2184 2308 { 2185 2309 union bpf_attr attr = {}; ··· 2391 2263 break; 2392 2264 case BPF_BTF_GET_FD_BY_ID: 2393 2265 err = bpf_btf_get_fd_by_id(&attr); 2266 + break; 2267 + case BPF_TASK_FD_QUERY: 2268 + err = bpf_task_fd_query(&attr, uattr); 2394 2269 break; 2395 2270 default: 2396 2271 err = -EINVAL;
+48
kernel/trace/bpf_trace.c
··· 14 14 #include <linux/uaccess.h> 15 15 #include <linux/ctype.h> 16 16 #include <linux/kprobes.h> 17 + #include <linux/syscalls.h> 17 18 #include <linux/error-injection.h> 18 19 19 20 #include "trace_probe.h" ··· 1162 1161 mutex_lock(&bpf_event_mutex); 1163 1162 err = tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog); 1164 1163 mutex_unlock(&bpf_event_mutex); 1164 + return err; 1165 + } 1166 + 1167 + int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, 1168 + u32 *fd_type, const char **buf, 1169 + u64 *probe_offset, u64 *probe_addr) 1170 + { 1171 + bool is_tracepoint, is_syscall_tp; 1172 + struct bpf_prog *prog; 1173 + int flags, err = 0; 1174 + 1175 + prog = event->prog; 1176 + if (!prog) 1177 + return -ENOENT; 1178 + 1179 + /* not supporting BPF_PROG_TYPE_PERF_EVENT yet */ 1180 + if (prog->type == BPF_PROG_TYPE_PERF_EVENT) 1181 + return -EOPNOTSUPP; 1182 + 1183 + *prog_id = prog->aux->id; 1184 + flags = event->tp_event->flags; 1185 + is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT; 1186 + is_syscall_tp = is_syscall_trace_event(event->tp_event); 1187 + 1188 + if (is_tracepoint || is_syscall_tp) { 1189 + *buf = is_tracepoint ? event->tp_event->tp->name 1190 + : event->tp_event->name; 1191 + *fd_type = BPF_FD_TYPE_TRACEPOINT; 1192 + *probe_offset = 0x0; 1193 + *probe_addr = 0x0; 1194 + } else { 1195 + /* kprobe/uprobe */ 1196 + err = -EOPNOTSUPP; 1197 + #ifdef CONFIG_KPROBE_EVENTS 1198 + if (flags & TRACE_EVENT_FL_KPROBE) 1199 + err = bpf_get_kprobe_info(event, fd_type, buf, 1200 + probe_offset, probe_addr, 1201 + event->attr.type == PERF_TYPE_TRACEPOINT); 1202 + #endif 1203 + #ifdef CONFIG_UPROBE_EVENTS 1204 + if (flags & TRACE_EVENT_FL_UPROBE) 1205 + err = bpf_get_uprobe_info(event, fd_type, buf, 1206 + probe_offset, 1207 + event->attr.type == PERF_TYPE_TRACEPOINT); 1208 + #endif 1209 + } 1210 + 1165 1211 return err; 1166 1212 }
+29
kernel/trace/trace_kprobe.c
··· 1287 1287 head, NULL); 1288 1288 } 1289 1289 NOKPROBE_SYMBOL(kretprobe_perf_func); 1290 + 1291 + int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, 1292 + const char **symbol, u64 *probe_offset, 1293 + u64 *probe_addr, bool perf_type_tracepoint) 1294 + { 1295 + const char *pevent = trace_event_name(event->tp_event); 1296 + const char *group = event->tp_event->class->system; 1297 + struct trace_kprobe *tk; 1298 + 1299 + if (perf_type_tracepoint) 1300 + tk = find_trace_kprobe(pevent, group); 1301 + else 1302 + tk = event->tp_event->data; 1303 + if (!tk) 1304 + return -EINVAL; 1305 + 1306 + *fd_type = trace_kprobe_is_return(tk) ? BPF_FD_TYPE_KRETPROBE 1307 + : BPF_FD_TYPE_KPROBE; 1308 + if (tk->symbol) { 1309 + *symbol = tk->symbol; 1310 + *probe_offset = tk->rp.kp.offset; 1311 + *probe_addr = 0; 1312 + } else { 1313 + *symbol = NULL; 1314 + *probe_offset = 0; 1315 + *probe_addr = (unsigned long)tk->rp.kp.addr; 1316 + } 1317 + return 0; 1318 + } 1290 1319 #endif /* CONFIG_PERF_EVENTS */ 1291 1320 1292 1321 /*
+22
kernel/trace/trace_uprobe.c
··· 1161 1161 { 1162 1162 __uprobe_perf_func(tu, func, regs, ucb, dsize); 1163 1163 } 1164 + 1165 + int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, 1166 + const char **filename, u64 *probe_offset, 1167 + bool perf_type_tracepoint) 1168 + { 1169 + const char *pevent = trace_event_name(event->tp_event); 1170 + const char *group = event->tp_event->class->system; 1171 + struct trace_uprobe *tu; 1172 + 1173 + if (perf_type_tracepoint) 1174 + tu = find_probe_event(pevent, group); 1175 + else 1176 + tu = event->tp_event->data; 1177 + if (!tu) 1178 + return -EINVAL; 1179 + 1180 + *fd_type = is_ret_probe(tu) ? BPF_FD_TYPE_URETPROBE 1181 + : BPF_FD_TYPE_UPROBE; 1182 + *filename = tu->filename; 1183 + *probe_offset = tu->offset; 1184 + return 0; 1185 + } 1164 1186 #endif /* CONFIG_PERF_EVENTS */ 1165 1187 1166 1188 static int