Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

perf: Add PERF_RECORD_NAMESPACES to include namespaces related info

With the advent of container technologies like Docker, which depend on
namespaces for isolation, there is a need for tracing support for
namespaces. This patch introduces a new PERF_RECORD_NAMESPACES event for
recording namespace-related info. Because info is recorded for every
namespace, it is left to userspace to decide what constitutes a
container and to trace containers by updating the perf tool accordingly.

Each namespace has a combination of device and inode numbers. Though
every namespace has the same device number currently, that may change in
the future to avoid the need for a namespace of namespaces. Considering that
possibility, record both device and inode numbers separately for each
namespace.

Signed-off-by: Hari Bathini <hbathini@linux.vnet.ibm.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Sargun Dhillon <sargun@sargun.me>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/148891929686.25309.2827618988917007768.stgit@hbathini.in.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

authored by

Hari Bathini and committed by
Arnaldo Carvalho de Melo
e4222673 3ef5b402

+177 -1
+2
include/linux/perf_event.h
··· 1112 1112 1113 1113 extern void perf_event_exec(void); 1114 1114 extern void perf_event_comm(struct task_struct *tsk, bool exec); 1115 + extern void perf_event_namespaces(struct task_struct *tsk); 1115 1116 extern void perf_event_fork(struct task_struct *tsk); 1116 1117 1117 1118 /* Callchains */ ··· 1316 1315 static inline void perf_event_mmap(struct vm_area_struct *vma) { } 1317 1316 static inline void perf_event_exec(void) { } 1318 1317 static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } 1318 + static inline void perf_event_namespaces(struct task_struct *tsk) { } 1319 1319 static inline void perf_event_fork(struct task_struct *tsk) { } 1320 1320 static inline void perf_event_init(void) { } 1321 1321 static inline int perf_swevent_get_recursion_context(void) { return -1; }
+31 -1
include/uapi/linux/perf_event.h
··· 344 344 use_clockid : 1, /* use @clockid for time fields */ 345 345 context_switch : 1, /* context switch data */ 346 346 write_backward : 1, /* Write ring buffer from end to beginning */ 347 - __reserved_1 : 36; 347 + namespaces : 1, /* include namespaces data */ 348 + __reserved_1 : 35; 348 349 349 350 union { 350 351 __u32 wakeup_events; /* wakeup every n events */ ··· 611 610 __u16 size; 612 611 }; 613 612 613 + struct perf_ns_link_info { 614 + __u64 dev; 615 + __u64 ino; 616 + }; 617 + 618 + enum { 619 + NET_NS_INDEX = 0, 620 + UTS_NS_INDEX = 1, 621 + IPC_NS_INDEX = 2, 622 + PID_NS_INDEX = 3, 623 + USER_NS_INDEX = 4, 624 + MNT_NS_INDEX = 5, 625 + CGROUP_NS_INDEX = 6, 626 + 627 + NR_NAMESPACES, /* number of available namespaces */ 628 + }; 629 + 614 630 enum perf_event_type { 615 631 616 632 /* ··· 879 861 * }; 880 862 */ 881 863 PERF_RECORD_SWITCH_CPU_WIDE = 15, 864 + 865 + /* 866 + * struct { 867 + * struct perf_event_header header; 868 + * u32 pid; 869 + * u32 tid; 870 + * u64 nr_namespaces; 871 + * { u64 dev, inode; } [nr_namespaces]; 872 + * struct sample_id sample_id; 873 + * }; 874 + */ 875 + PERF_RECORD_NAMESPACES = 16, 882 876 883 877 PERF_RECORD_MAX, /* non-ABI */ 884 878 };
+139
kernel/events/core.c
··· 48 48 #include <linux/parser.h> 49 49 #include <linux/sched/clock.h> 50 50 #include <linux/sched/mm.h> 51 + #include <linux/proc_ns.h> 52 + #include <linux/mount.h> 51 53 52 54 #include "internal.h" 53 55 ··· 381 379 382 380 static atomic_t nr_mmap_events __read_mostly; 383 381 static atomic_t nr_comm_events __read_mostly; 382 + static atomic_t nr_namespaces_events __read_mostly; 384 383 static atomic_t nr_task_events __read_mostly; 385 384 static atomic_t nr_freq_events __read_mostly; 386 385 static atomic_t nr_switch_events __read_mostly; ··· 3994 3991 atomic_dec(&nr_mmap_events); 3995 3992 if (event->attr.comm) 3996 3993 atomic_dec(&nr_comm_events); 3994 + if (event->attr.namespaces) 3995 + atomic_dec(&nr_namespaces_events); 3997 3996 if (event->attr.task) 3998 3997 atomic_dec(&nr_task_events); 3999 3998 if (event->attr.freq) ··· 6496 6491 void perf_event_fork(struct task_struct *task) 6497 6492 { 6498 6493 perf_event_task(task, NULL, 1); 6494 + perf_event_namespaces(task); 6499 6495 } 6500 6496 6501 6497 /* ··· 6596 6590 }; 6597 6591 6598 6592 perf_event_comm_event(&comm_event); 6593 + } 6594 + 6595 + /* 6596 + * namespaces tracking 6597 + */ 6598 + 6599 + struct perf_namespaces_event { 6600 + struct task_struct *task; 6601 + 6602 + struct { 6603 + struct perf_event_header header; 6604 + 6605 + u32 pid; 6606 + u32 tid; 6607 + u64 nr_namespaces; 6608 + struct perf_ns_link_info link_info[NR_NAMESPACES]; 6609 + } event_id; 6610 + }; 6611 + 6612 + static int perf_event_namespaces_match(struct perf_event *event) 6613 + { 6614 + return event->attr.namespaces; 6615 + } 6616 + 6617 + static void perf_event_namespaces_output(struct perf_event *event, 6618 + void *data) 6619 + { 6620 + struct perf_namespaces_event *namespaces_event = data; 6621 + struct perf_output_handle handle; 6622 + struct perf_sample_data sample; 6623 + int ret; 6624 + 6625 + if (!perf_event_namespaces_match(event)) 6626 + return; 6627 + 6628 + 
perf_event_header__init_id(&namespaces_event->event_id.header, 6629 + &sample, event); 6630 + ret = perf_output_begin(&handle, event, 6631 + namespaces_event->event_id.header.size); 6632 + if (ret) 6633 + return; 6634 + 6635 + namespaces_event->event_id.pid = perf_event_pid(event, 6636 + namespaces_event->task); 6637 + namespaces_event->event_id.tid = perf_event_tid(event, 6638 + namespaces_event->task); 6639 + 6640 + perf_output_put(&handle, namespaces_event->event_id); 6641 + 6642 + perf_event__output_id_sample(event, &handle, &sample); 6643 + 6644 + perf_output_end(&handle); 6645 + } 6646 + 6647 + static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, 6648 + struct task_struct *task, 6649 + const struct proc_ns_operations *ns_ops) 6650 + { 6651 + struct path ns_path; 6652 + struct inode *ns_inode; 6653 + void *error; 6654 + 6655 + error = ns_get_path(&ns_path, task, ns_ops); 6656 + if (!error) { 6657 + ns_inode = ns_path.dentry->d_inode; 6658 + ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev); 6659 + ns_link_info->ino = ns_inode->i_ino; 6660 + } 6661 + } 6662 + 6663 + void perf_event_namespaces(struct task_struct *task) 6664 + { 6665 + struct perf_namespaces_event namespaces_event; 6666 + struct perf_ns_link_info *ns_link_info; 6667 + 6668 + if (!atomic_read(&nr_namespaces_events)) 6669 + return; 6670 + 6671 + namespaces_event = (struct perf_namespaces_event){ 6672 + .task = task, 6673 + .event_id = { 6674 + .header = { 6675 + .type = PERF_RECORD_NAMESPACES, 6676 + .misc = 0, 6677 + .size = sizeof(namespaces_event.event_id), 6678 + }, 6679 + /* .pid */ 6680 + /* .tid */ 6681 + .nr_namespaces = NR_NAMESPACES, 6682 + /* .link_info[NR_NAMESPACES] */ 6683 + }, 6684 + }; 6685 + 6686 + ns_link_info = namespaces_event.event_id.link_info; 6687 + 6688 + perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX], 6689 + task, &mntns_operations); 6690 + 6691 + #ifdef CONFIG_USER_NS 6692 + perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX], 6693 + 
task, &userns_operations); 6694 + #endif 6695 + #ifdef CONFIG_NET_NS 6696 + perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX], 6697 + task, &netns_operations); 6698 + #endif 6699 + #ifdef CONFIG_UTS_NS 6700 + perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX], 6701 + task, &utsns_operations); 6702 + #endif 6703 + #ifdef CONFIG_IPC_NS 6704 + perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX], 6705 + task, &ipcns_operations); 6706 + #endif 6707 + #ifdef CONFIG_PID_NS 6708 + perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX], 6709 + task, &pidns_operations); 6710 + #endif 6711 + #ifdef CONFIG_CGROUPS 6712 + perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX], 6713 + task, &cgroupns_operations); 6714 + #endif 6715 + 6716 + perf_iterate_sb(perf_event_namespaces_output, 6717 + &namespaces_event, 6718 + NULL); 6599 6719 } 6600 6720 6601 6721 /* ··· 9278 9146 atomic_inc(&nr_mmap_events); 9279 9147 if (event->attr.comm) 9280 9148 atomic_inc(&nr_comm_events); 9149 + if (event->attr.namespaces) 9150 + atomic_inc(&nr_namespaces_events); 9281 9151 if (event->attr.task) 9282 9152 atomic_inc(&nr_task_events); 9283 9153 if (event->attr.freq) ··· 9822 9688 9823 9689 if (!attr.exclude_kernel) { 9824 9690 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) 9691 + return -EACCES; 9692 + } 9693 + 9694 + if (attr.namespaces) { 9695 + if (!capable(CAP_SYS_ADMIN)) 9825 9696 return -EACCES; 9826 9697 } 9827 9698
+2
kernel/fork.c
··· 2352 2352 } 2353 2353 } 2354 2354 2355 + perf_event_namespaces(current); 2356 + 2355 2357 bad_unshare_cleanup_cred: 2356 2358 if (new_cred) 2357 2359 put_cred(new_cred);
+3
kernel/nsproxy.c
··· 26 26 #include <linux/file.h> 27 27 #include <linux/syscalls.h> 28 28 #include <linux/cgroup.h> 29 + #include <linux/perf_event.h> 29 30 30 31 static struct kmem_cache *nsproxy_cachep; 31 32 ··· 263 262 goto out; 264 263 } 265 264 switch_task_namespaces(tsk, new_nsproxy); 265 + 266 + perf_event_namespaces(tsk); 266 267 out: 267 268 fput(file); 268 269 return err;