Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

procfs: introduce the /proc/<pid>/map_files/ directory

This one behaves similarly to the /proc/<pid>/fd/ one - it contains
symlinks, one for each mapping with a file; the name of a symlink is
"vma->vm_start-vma->vm_end", the target is the file. Opening a symlink
results in a file that points to exactly the same inode as the vma's one.

For example, the ls -l output of some arbitrary /proc/<pid>/map_files/:

| lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
| lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
| lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
| lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
| lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so

This *helps* the checkpointing process in three ways:

1. When dumping a task's mappings we know the exact file that is mapped
by a particular region. We do this by opening the
/proc/$pid/map_files/$address symlink the way we do with file
descriptors.

2. This also helps in determining which anonymous shared mappings are
shared with each other by comparing their inodes.

3. When restoring a set of processes, in case two of them have a shared
mapping, we map the memory in the 1st one and then open its
/proc/$pid/map_files/$address file and map it in the 2nd task.

Using /proc/$pid/maps for this is quite inconvenient since it requires
repeatedly re-reading and reparsing this text file, which slows down the
restore procedure significantly. Also, as pointed out in (3), it is much
easier to use a top-level shared mapping in children as
/proc/$pid/map_files/$address when needed.

[akpm@linux-foundation.org: coding-style fixes]
[gorcunov@openvz.org: make map_files depend on CHECKPOINT_RESTORE]
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Reviewed-by: Vasiliy Kulikov <segoon@openwall.com>
Reviewed-by: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Tejun Heo <tj@kernel.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Pavel Emelyanov and committed by
Linus Torvalds
640708a2 7773fbc5

+367
+355
fs/proc/base.c
··· 83 83 #include <linux/pid_namespace.h> 84 84 #include <linux/fs_struct.h> 85 85 #include <linux/slab.h> 86 + #include <linux/flex_array.h> 86 87 #ifdef CONFIG_HARDWALL 87 88 #include <asm/hardwall.h> 88 89 #endif ··· 134 133 NOD(NAME, (S_IFREG|(MODE)), \ 135 134 NULL, &proc_single_file_operations, \ 136 135 { .proc_show = show } ) 136 + 137 + static int proc_fd_permission(struct inode *inode, int mask); 137 138 138 139 /* 139 140 * Count the number of hardlinks for the pid_entry table, excluding the . ··· 2049 2046 .llseek = default_llseek, 2050 2047 }; 2051 2048 2049 + #ifdef CONFIG_CHECKPOINT_RESTORE 2050 + 2051 + /* 2052 + * dname_to_vma_addr - parses a dentry name of the form <start>-<end> (hex) into two unsigned longs 2053 + * which represent vma start and end addresses. Returns 0, or -EINVAL unless sscanf() converts both values. NOTE(review): trailing characters after <end> are not rejected here - confirm that is intended. 2054 + */ 2055 + static int dname_to_vma_addr(struct dentry *dentry, 2056 + unsigned long *start, unsigned long *end) 2057 + { 2058 + if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2) 2059 + return -EINVAL; 2060 + 2061 + return 0; 2062 + } 2063 + /* Revalidate a map_files dentry: returns 1 only if a vma with exactly the named bounds still exists; when status <= 0 the dentry is dropped. */ 2064 + static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd) 2065 + { 2066 + unsigned long vm_start, vm_end; 2067 + bool exact_vma_exists = false; 2068 + struct mm_struct *mm = NULL; 2069 + struct task_struct *task; 2070 + const struct cred *cred; 2071 + struct inode *inode; 2072 + int status = 0; 2073 + 2074 + if (nd && nd->flags & LOOKUP_RCU) 2075 + return -ECHILD; 2076 + 2077 + if (!capable(CAP_SYS_ADMIN)) { 2078 + status = -EACCES; 2079 + goto out_notask; 2080 + } 2081 + 2082 + inode = dentry->d_inode; 2083 + task = get_proc_task(inode); 2084 + if (!task) 2085 + goto out_notask; 2086 + 2087 + if (!ptrace_may_access(task, PTRACE_MODE_READ)) 2088 + goto out; 2089 + 2090 + mm = get_task_mm(task); 2091 + if (!mm) 2092 + goto out; 2093 + 2094 + if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { 2095 + down_read(&mm->mmap_sem); 2096 + exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end); 2097 + up_read(&mm->mmap_sem); 2098 + } 2099 + 2100 
+ mmput(mm); 2101 + 2102 + if (exact_vma_exists) { 2103 + if (task_dumpable(task)) { 2104 + rcu_read_lock(); 2105 + cred = __task_cred(task); 2106 + inode->i_uid = cred->euid; 2107 + inode->i_gid = cred->egid; 2108 + rcu_read_unlock(); 2109 + } else { 2110 + inode->i_uid = 0; 2111 + inode->i_gid = 0; 2112 + } 2113 + security_task_to_inode(task, inode); 2114 + status = 1; 2115 + } 2116 + 2117 + out: 2118 + put_task_struct(task); 2119 + 2120 + out_notask: 2121 + if (status <= 0) 2122 + d_drop(dentry); 2123 + 2124 + return status; 2125 + } 2126 + 2127 + static const struct dentry_operations tid_map_files_dentry_operations = { 2128 + .d_revalidate = map_files_d_revalidate, 2129 + .d_delete = pid_delete_dentry, 2130 + }; 2131 + /* Fill *path with the f_path of the file backing the vma named by the dentry (taking a path reference); returns 0 on success or a negative errno. */ 2132 + static int proc_map_files_get_link(struct dentry *dentry, struct path *path) 2133 + { 2134 + unsigned long vm_start, vm_end; 2135 + struct vm_area_struct *vma; 2136 + struct task_struct *task; 2137 + struct mm_struct *mm; 2138 + int rc; 2139 + 2140 + rc = -ENOENT; 2141 + task = get_proc_task(dentry->d_inode); 2142 + if (!task) 2143 + goto out; 2144 + 2145 + mm = get_task_mm(task); 2146 + put_task_struct(task); 2147 + if (!mm) 2148 + goto out; 2149 + 2150 + rc = dname_to_vma_addr(dentry, &vm_start, &vm_end); 2151 + if (rc) 2152 + goto out_mmput; 2153 + rc = -ENOENT; /* rc is 0 after successful parsing; reset it so a missing or file-less vma is not reported as success with *path unset */ 2154 + down_read(&mm->mmap_sem); 2155 + vma = find_exact_vma(mm, vm_start, vm_end); 2156 + if (vma && vma->vm_file) { 2157 + *path = vma->vm_file->f_path; 2158 + path_get(path); 2159 + rc = 0; 2160 + } 2161 + up_read(&mm->mmap_sem); 2162 + 2163 + out_mmput: 2164 + mmput(mm); 2165 + out: 2166 + return rc; 2167 + } 2168 + 2169 + struct map_files_info { 2170 + struct file *file; 2171 + unsigned long len; 2172 + unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ 2173 + }; 2174 + /* Create the symlink inode for one mapped file (ptr is the struct file); the owner r/w mode bits mirror the file's f_mode. Returns NULL on success. */ 2175 + static struct dentry * 2176 + proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, 2177 + struct task_struct *task, const void *ptr) 2178 + { 2179 + const struct file *file = ptr; 2180 + 
struct proc_inode *ei; 2181 + struct inode *inode; 2182 + 2183 + if (!file) 2184 + return ERR_PTR(-ENOENT); 2185 + 2186 + inode = proc_pid_make_inode(dir->i_sb, task); 2187 + if (!inode) 2188 + return ERR_PTR(-ENOENT); 2189 + 2190 + ei = PROC_I(inode); 2191 + ei->op.proc_get_link = proc_map_files_get_link; 2192 + 2193 + inode->i_op = &proc_pid_link_inode_operations; 2194 + inode->i_size = 64; 2195 + inode->i_mode = S_IFLNK; 2196 + 2197 + if (file->f_mode & FMODE_READ) 2198 + inode->i_mode |= S_IRUSR; 2199 + if (file->f_mode & FMODE_WRITE) 2200 + inode->i_mode |= S_IWUSR; 2201 + 2202 + d_set_d_op(dentry, &tid_map_files_dentry_operations); 2203 + d_add(dentry, inode); 2204 + 2205 + return NULL; 2206 + } 2207 + /* Lookup of a <start>-<end> name in map_files: requires CAP_SYS_ADMIN and instantiates an entry when a vma with exactly those bounds exists. */ 2208 + static struct dentry *proc_map_files_lookup(struct inode *dir, 2209 + struct dentry *dentry, struct nameidata *nd) 2210 + { 2211 + unsigned long vm_start, vm_end; 2212 + struct vm_area_struct *vma; 2213 + struct task_struct *task; 2214 + struct dentry *result; 2215 + struct mm_struct *mm; 2216 + 2217 + result = ERR_PTR(-EACCES); 2218 + if (!capable(CAP_SYS_ADMIN)) 2219 + goto out; 2220 + 2221 + result = ERR_PTR(-ENOENT); 2222 + task = get_proc_task(dir); 2223 + if (!task) 2224 + goto out; 2225 + 2226 + result = ERR_PTR(-EACCES); 2227 + if (lock_trace(task)) 2228 + goto out_put_task; 2229 + 2230 + result = ERR_PTR(-ENOENT); 2231 + if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) 2232 + goto out_unlock; 2233 + 2234 + mm = get_task_mm(task); 2235 + if (!mm) 2236 + goto out_unlock; 2237 + 2238 + down_read(&mm->mmap_sem); 2239 + vma = find_exact_vma(mm, vm_start, vm_end); 2240 + if (!vma) 2241 + goto out_no_vma; 2242 + 2243 + result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file); 2244 + 2245 + out_no_vma: 2246 + up_read(&mm->mmap_sem); 2247 + mmput(mm); 2248 + out_unlock: 2249 + unlock_trace(task); 2250 + out_put_task: 2251 + put_task_struct(task); 2252 + out: 2253 + return result; 2254 + } 2255 + 2256 + static const struct inode_operations 
proc_map_files_inode_operations = { 2257 + .lookup = proc_map_files_lookup, 2258 + .permission = proc_fd_permission, 2259 + .setattr = proc_setattr, 2260 + }; 2261 + /* readdir for map_files: requires CAP_SYS_ADMIN; emits the . and .. entries followed by one entry per vma that has a vm_file. */ 2262 + static int 2263 + proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) 2264 + { 2265 + struct dentry *dentry = filp->f_path.dentry; 2266 + struct inode *inode = dentry->d_inode; 2267 + struct vm_area_struct *vma; 2268 + struct task_struct *task; 2269 + struct mm_struct *mm; 2270 + ino_t ino; 2271 + int ret; 2272 + 2273 + ret = -EACCES; 2274 + if (!capable(CAP_SYS_ADMIN)) 2275 + goto out; 2276 + 2277 + ret = -ENOENT; 2278 + task = get_proc_task(inode); 2279 + if (!task) 2280 + goto out; 2281 + 2282 + ret = -EACCES; 2283 + if (lock_trace(task)) 2284 + goto out_put_task; 2285 + 2286 + ret = 0; 2287 + switch (filp->f_pos) { 2288 + case 0: 2289 + ino = inode->i_ino; 2290 + if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0) 2291 + goto out_unlock; 2292 + filp->f_pos++; /* fall through */ 2293 + case 1: 2294 + ino = parent_ino(dentry); 2295 + if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) 2296 + goto out_unlock; 2297 + filp->f_pos++; /* fall through */ 2298 + default: 2299 + { 2300 + unsigned long nr_files, pos, i; 2301 + struct flex_array *fa = NULL; 2302 + struct map_files_info info; 2303 + struct map_files_info *p; 2304 + 2305 + mm = get_task_mm(task); 2306 + if (!mm) 2307 + goto out_unlock; 2308 + down_read(&mm->mmap_sem); 2309 + 2310 + nr_files = 0; 2311 + 2312 + /* 2313 + * We need two passes here: 2314 + * 2315 + * 1) Collect vmas of mapped files with mmap_sem taken 2316 + * 2) Release mmap_sem and instantiate entries 2317 + * 2318 + * otherwise lockdep complains, since the filldir() 2319 + * routine might need to take mmap_sem in might_fault(). 
2320 + */ 2321 + 2322 + for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { 2323 + if (vma->vm_file && ++pos > filp->f_pos) 2324 + nr_files++; 2325 + } 2326 + 2327 + if (nr_files) { 2328 + fa = flex_array_alloc(sizeof(info), nr_files, 2329 + GFP_KERNEL); 2330 + if (!fa || flex_array_prealloc(fa, 0, nr_files, 2331 + GFP_KERNEL)) { 2332 + ret = -ENOMEM; 2333 + if (fa) 2334 + flex_array_free(fa); 2335 + up_read(&mm->mmap_sem); 2336 + mmput(mm); 2337 + goto out_unlock; 2338 + } 2339 + for (i = 0, vma = mm->mmap, pos = 2; vma; 2340 + vma = vma->vm_next) { 2341 + if (!vma->vm_file) 2342 + continue; 2343 + if (++pos <= filp->f_pos) 2344 + continue; 2345 + 2346 + get_file(vma->vm_file); 2347 + info.file = vma->vm_file; 2348 + info.len = snprintf(info.name, 2349 + sizeof(info.name), "%lx-%lx", 2350 + vma->vm_start, vma->vm_end); 2351 + if (flex_array_put(fa, i++, &info, GFP_KERNEL)) 2352 + BUG(); 2353 + } 2354 + } 2355 + up_read(&mm->mmap_sem); 2356 + 2357 + for (i = 0; i < nr_files; i++) { 2358 + p = flex_array_get(fa, i); 2359 + ret = proc_fill_cache(filp, dirent, filldir, 2360 + p->name, p->len, 2361 + proc_map_files_instantiate, 2362 + task, p->file); 2363 + if (ret) 2364 + break; 2365 + filp->f_pos++; 2366 + fput(p->file); 2367 + } 2368 + for (; i < nr_files; i++) { 2369 + /* 2370 + * In case of error don't forget 2371 + * to put the rest of the file refs. 
2372 + */ 2373 + p = flex_array_get(fa, i); 2374 + fput(p->file); 2375 + } 2376 + if (fa) 2377 + flex_array_free(fa); 2378 + mmput(mm); 2379 + } 2380 + } 2381 + 2382 + out_unlock: 2383 + unlock_trace(task); 2384 + out_put_task: 2385 + put_task_struct(task); 2386 + out: 2387 + return ret; 2388 + } 2389 + 2390 + static const struct file_operations proc_map_files_operations = { 2391 + .read = generic_read_dir, 2392 + .readdir = proc_map_files_readdir, 2393 + .llseek = default_llseek, 2394 + }; 2395 + 2396 + #endif /* CONFIG_CHECKPOINT_RESTORE */ 2397 + 2052 2398 /* 2053 2399 * /proc/pid/fd needs a special permission handler so that a process can still 2054 2400 * access /proc/self/fd after it has executed a setuid(). ··· 3013 2661 static const struct pid_entry tgid_base_stuff[] = { 3014 2662 DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), 3015 2663 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), 2664 + #ifdef CONFIG_CHECKPOINT_RESTORE 2665 + DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), 2666 + #endif 3016 2667 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), 3017 2668 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), 3018 2669 #ifdef CONFIG_NET
+12
include/linux/mm.h
··· 1482 1482 return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; 1483 1483 } 1484 1484 1485 + /* Look up the VMA that find_vma() returns for vm_start and return it only if it exactly matches the interval vm_start ... vm_end; otherwise return NULL */ 1486 + static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, 1487 + unsigned long vm_start, unsigned long vm_end) 1488 + { 1489 + struct vm_area_struct *vma = find_vma(mm, vm_start); 1490 + 1491 + if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end)) 1492 + vma = NULL; 1493 + 1494 + return vma; 1495 + } 1496 + 1485 1497 #ifdef CONFIG_MMU 1486 1498 pgprot_t vm_get_page_prot(unsigned long vm_flags); 1487 1499 #else