Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

kcmp: add KCMP_EPOLL_TFD mode to compare epoll target files

With the current epoll architecture, target files are addressed by
file_struct and file descriptor number, where the latter is not unique.
Moreover, files can be transferred from another process via a unix
socket, added into the queue and then closed, so we won't find this
descriptor in the task's fdinfo list.

Thus, to checkpoint and restore such processes, CRIU needs to find out
where exactly the target file is present in order to add it into the
epoll queue. For this purpose one can use the kcmp call, where a
particular target file from the queue is compared with an arbitrary file
passed as an argument.

Because epoll target files can have the same file descriptor number but
a different file_struct, the caller should explicitly specify the offset
within the sequence of targets sharing that number.

To test whether a particular file matches an entry inside epoll, one has
to:

- fill the kcmp_epoll_slot structure with the epoll file descriptor,
target file number and target file offset (if only one target is
present, the offset should be 0)

- call kcmp as kcmp(pid1, pid2, KCMP_EPOLL_TFD, fd, &kcmp_epoll_slot)
- the kernel fetches the file pointer matching file descriptor @fd of pid1
- looks up the file struct in the epoll queue of pid2 and returns the
traditional 0,1,2 result for sorting purposes

Link: http://lkml.kernel.org/r/20170424154423.511592110@gmail.com
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Acked-by: Andrey Vagin <avagin@openvz.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Pavel Emelyanov <xemul@virtuozzo.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Cyrill Gorcunov and committed by
Linus Torvalds
0791e364 77493f04

+112
+42
fs/eventpoll.c
··· 1077 1077 return epir; 1078 1078 } 1079 1079 1080 + static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff) 1081 + { 1082 + struct rb_node *rbp; 1083 + struct epitem *epi; 1084 + 1085 + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { 1086 + epi = rb_entry(rbp, struct epitem, rbn); 1087 + if (epi->ffd.fd == tfd) { 1088 + if (toff == 0) 1089 + return epi; 1090 + else 1091 + toff--; 1092 + } 1093 + cond_resched(); 1094 + } 1095 + 1096 + return NULL; 1097 + } 1098 + 1099 + struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, 1100 + unsigned long toff) 1101 + { 1102 + struct file *file_raw; 1103 + struct eventpoll *ep; 1104 + struct epitem *epi; 1105 + 1106 + if (!is_file_epoll(file)) 1107 + return ERR_PTR(-EINVAL); 1108 + 1109 + ep = file->private_data; 1110 + 1111 + mutex_lock(&ep->mtx); 1112 + epi = ep_find_tfd(ep, tfd, toff); 1113 + if (epi) 1114 + file_raw = epi->ffd.file; 1115 + else 1116 + file_raw = ERR_PTR(-ENOENT); 1117 + mutex_unlock(&ep->mtx); 1118 + 1119 + return file_raw; 1120 + } 1121 + 1080 1122 /* 1081 1123 * This is the callback that is passed to the wait queue wakeup 1082 1124 * mechanism. It is called by the stored file descriptors when they
+3
include/linux/eventpoll.h
··· 14 14 #define _LINUX_EVENTPOLL_H 15 15 16 16 #include <uapi/linux/eventpoll.h> 17 + #include <uapi/linux/kcmp.h> 17 18 18 19 19 20 /* Forward declarations to avoid compiler errors */ ··· 22 21 23 22 24 23 #ifdef CONFIG_EPOLL 24 + 25 + struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff); 25 26 26 27 /* Used to initialize the epoll bits inside the "struct file" */ 27 28 static inline void eventpoll_init_file(struct file *file)
+10
include/uapi/linux/kcmp.h
··· 1 1 #ifndef _UAPI_LINUX_KCMP_H 2 2 #define _UAPI_LINUX_KCMP_H 3 3 4 + #include <linux/types.h> 5 + 4 6 /* Comparison type */ 5 7 enum kcmp_type { 6 8 KCMP_FILE, ··· 12 10 KCMP_SIGHAND, 13 11 KCMP_IO, 14 12 KCMP_SYSVSEM, 13 + KCMP_EPOLL_TFD, 15 14 16 15 KCMP_TYPES, 16 + }; 17 + 18 + /* Slot for KCMP_EPOLL_TFD */ 19 + struct kcmp_epoll_slot { 20 + __u32 efd; /* epoll file descriptor */ 21 + __u32 tfd; /* target file number */ 22 + __u32 toff; /* target offset within same numbered sequence */ 17 23 }; 18 24 19 25 #endif /* _UAPI_LINUX_KCMP_H */
+57
kernel/kcmp.c
··· 11 11 #include <linux/bug.h> 12 12 #include <linux/err.h> 13 13 #include <linux/kcmp.h> 14 + #include <linux/capability.h> 15 + #include <linux/list.h> 16 + #include <linux/eventpoll.h> 17 + #include <linux/file.h> 14 18 15 19 #include <asm/unistd.h> 16 20 ··· 98 94 return err; 99 95 } 100 96 97 + #ifdef CONFIG_EPOLL 98 + static int kcmp_epoll_target(struct task_struct *task1, 99 + struct task_struct *task2, 100 + unsigned long idx1, 101 + struct kcmp_epoll_slot __user *uslot) 102 + { 103 + struct file *filp, *filp_epoll, *filp_tgt; 104 + struct kcmp_epoll_slot slot; 105 + struct files_struct *files; 106 + 107 + if (copy_from_user(&slot, uslot, sizeof(slot))) 108 + return -EFAULT; 109 + 110 + filp = get_file_raw_ptr(task1, idx1); 111 + if (!filp) 112 + return -EBADF; 113 + 114 + files = get_files_struct(task2); 115 + if (!files) 116 + return -EBADF; 117 + 118 + spin_lock(&files->file_lock); 119 + filp_epoll = fcheck_files(files, slot.efd); 120 + if (filp_epoll) 121 + get_file(filp_epoll); 122 + else 123 + filp_tgt = ERR_PTR(-EBADF); 124 + spin_unlock(&files->file_lock); 125 + put_files_struct(files); 126 + 127 + if (filp_epoll) { 128 + filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); 129 + fput(filp_epoll); 130 + } else 131 + 132 + if (IS_ERR(filp_tgt)) 133 + return PTR_ERR(filp_tgt); 134 + 135 + return kcmp_ptr(filp, filp_tgt, KCMP_FILE); 136 + } 137 + #else 138 + static int kcmp_epoll_target(struct task_struct *task1, 139 + struct task_struct *task2, 140 + unsigned long idx1, 141 + struct kcmp_epoll_slot __user *uslot) 142 + { 143 + return -EOPNOTSUPP; 144 + } 145 + #endif 146 + 101 147 SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, 102 148 unsigned long, idx1, unsigned long, idx2) 103 149 { ··· 218 164 #else 219 165 ret = -EOPNOTSUPP; 220 166 #endif 167 + break; 168 + case KCMP_EPOLL_TFD: 169 + ret = kcmp_epoll_target(task1, task2, idx1, (void *)idx2); 221 170 break; 222 171 default: 223 172 ret = -EINVAL;