Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: restructure memfd code

With the addition of memfd hugetlbfs support, we now have the situation
where memfd depends on TMPFS -or- HUGETLBFS. Previously, memfd was only
supported on tmpfs, so it made sense that the code resided in shmem.c.
In the current code, memfd is only functional if TMPFS is defined. If
HUGETLBFS is defined and TMPFS is not defined, then memfd functionality
will not be available for hugetlbfs. This does not cause BUGs, just a
lack of potentially desired functionality.

Code is restructured in the following way:
- include/linux/memfd.h is a new file containing memfd specific
definitions previously contained in shmem_fs.h.
- mm/memfd.c is a new file containing memfd specific code previously
contained in shmem.c.
- memfd specific code is removed from shmem_fs.h and shmem.c.
- A new config option MEMFD_CREATE is added that is defined if TMPFS
or HUGETLBFS is defined.

No functional changes are made to the code: restructuring only.

Link: http://lkml.kernel.org/r/20180415182119.4517-4-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Khalid Aziz <khalid.aziz@oracle.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Herrmann <dh.herrmann@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Marc-André Lureau <marcandre.lureau@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Mike Kravetz and committed by
Linus Torvalds
5d752600 c49fcfcd

+366 -338
+3
fs/Kconfig
··· 203 203 config HUGETLB_PAGE 204 204 def_bool HUGETLBFS 205 205 206 + config MEMFD_CREATE 207 + def_bool TMPFS || HUGETLBFS 208 + 206 209 config ARCH_HAS_GIGANTIC_PAGE 207 210 bool 208 211
+1 -1
fs/fcntl.c
··· 23 23 #include <linux/rcupdate.h> 24 24 #include <linux/pid_namespace.h> 25 25 #include <linux/user_namespace.h> 26 - #include <linux/shmem_fs.h> 26 + #include <linux/memfd.h> 27 27 #include <linux/compat.h> 28 28 29 29 #include <linux/poll.h>
+16
include/linux/memfd.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef __LINUX_MEMFD_H 3 + #define __LINUX_MEMFD_H 4 + 5 + #include <linux/file.h> 6 + 7 + #ifdef CONFIG_MEMFD_CREATE 8 + extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg); 9 + #else 10 + static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a) 11 + { 12 + return -EINVAL; 13 + } 14 + #endif 15 + 16 + #endif /* __LINUX_MEMFD_H */
-13
include/linux/shmem_fs.h
··· 110 110 extern bool shmem_charge(struct inode *inode, long pages); 111 111 extern void shmem_uncharge(struct inode *inode, long pages); 112 112 113 - #ifdef CONFIG_TMPFS 114 - 115 - extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg); 116 - 117 - #else 118 - 119 - static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a) 120 - { 121 - return -EINVAL; 122 - } 123 - 124 - #endif 125 - 126 113 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE 127 114 extern bool shmem_huge_enabled(struct vm_area_struct *vma); 128 115 #else
+1
mm/Makefile
··· 105 105 obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o 106 106 obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o 107 107 obj-$(CONFIG_HMM) += hmm.o 108 + obj-$(CONFIG_MEMFD_CREATE) += memfd.o
+345
mm/memfd.c
··· 1 + /* 2 + * memfd_create system call and file sealing support 3 + * 4 + * Code was originally included in shmem.c, and broken out to facilitate 5 + * use by hugetlbfs as well as tmpfs. 6 + * 7 + * This file is released under the GPL. 8 + */ 9 + 10 + #include <linux/fs.h> 11 + #include <linux/vfs.h> 12 + #include <linux/pagemap.h> 13 + #include <linux/file.h> 14 + #include <linux/mm.h> 15 + #include <linux/sched/signal.h> 16 + #include <linux/khugepaged.h> 17 + #include <linux/syscalls.h> 18 + #include <linux/hugetlb.h> 19 + #include <linux/shmem_fs.h> 20 + #include <linux/memfd.h> 21 + #include <uapi/linux/memfd.h> 22 + 23 + /* 24 + * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, 25 + * so reuse a tag which we firmly believe is never set or cleared on tmpfs 26 + * or hugetlbfs because they are memory only filesystems. 27 + */ 28 + #define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE 29 + #define LAST_SCAN 4 /* about 150ms max */ 30 + 31 + static void memfd_tag_pins(struct address_space *mapping) 32 + { 33 + struct radix_tree_iter iter; 34 + void __rcu **slot; 35 + pgoff_t start; 36 + struct page *page; 37 + 38 + lru_add_drain(); 39 + start = 0; 40 + rcu_read_lock(); 41 + 42 + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { 43 + page = radix_tree_deref_slot(slot); 44 + if (!page || radix_tree_exception(page)) { 45 + if (radix_tree_deref_retry(page)) { 46 + slot = radix_tree_iter_retry(&iter); 47 + continue; 48 + } 49 + } else if (page_count(page) - page_mapcount(page) > 1) { 50 + xa_lock_irq(&mapping->i_pages); 51 + radix_tree_tag_set(&mapping->i_pages, iter.index, 52 + MEMFD_TAG_PINNED); 53 + xa_unlock_irq(&mapping->i_pages); 54 + } 55 + 56 + if (need_resched()) { 57 + slot = radix_tree_iter_resume(slot, &iter); 58 + cond_resched_rcu(); 59 + } 60 + } 61 + rcu_read_unlock(); 62 + } 63 + 64 + /* 65 + * Setting SEAL_WRITE requires us to verify there's no pending writer. 
However, 66 + * via get_user_pages(), drivers might have some pending I/O without any active 67 + * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages 68 + * and see whether it has an elevated ref-count. If so, we tag them and wait for 69 + * them to be dropped. 70 + * The caller must guarantee that no new user will acquire writable references 71 + * to those pages to avoid races. 72 + */ 73 + static int memfd_wait_for_pins(struct address_space *mapping) 74 + { 75 + struct radix_tree_iter iter; 76 + void __rcu **slot; 77 + pgoff_t start; 78 + struct page *page; 79 + int error, scan; 80 + 81 + memfd_tag_pins(mapping); 82 + 83 + error = 0; 84 + for (scan = 0; scan <= LAST_SCAN; scan++) { 85 + if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED)) 86 + break; 87 + 88 + if (!scan) 89 + lru_add_drain_all(); 90 + else if (schedule_timeout_killable((HZ << scan) / 200)) 91 + scan = LAST_SCAN; 92 + 93 + start = 0; 94 + rcu_read_lock(); 95 + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 96 + start, MEMFD_TAG_PINNED) { 97 + 98 + page = radix_tree_deref_slot(slot); 99 + if (radix_tree_exception(page)) { 100 + if (radix_tree_deref_retry(page)) { 101 + slot = radix_tree_iter_retry(&iter); 102 + continue; 103 + } 104 + 105 + page = NULL; 106 + } 107 + 108 + if (page && 109 + page_count(page) - page_mapcount(page) != 1) { 110 + if (scan < LAST_SCAN) 111 + goto continue_resched; 112 + 113 + /* 114 + * On the last scan, we clean up all those tags 115 + * we inserted; but make a note that we still 116 + * found pages pinned. 
117 + */ 118 + error = -EBUSY; 119 + } 120 + 121 + xa_lock_irq(&mapping->i_pages); 122 + radix_tree_tag_clear(&mapping->i_pages, 123 + iter.index, MEMFD_TAG_PINNED); 124 + xa_unlock_irq(&mapping->i_pages); 125 + continue_resched: 126 + if (need_resched()) { 127 + slot = radix_tree_iter_resume(slot, &iter); 128 + cond_resched_rcu(); 129 + } 130 + } 131 + rcu_read_unlock(); 132 + } 133 + 134 + return error; 135 + } 136 + 137 + static unsigned int *memfd_file_seals_ptr(struct file *file) 138 + { 139 + if (shmem_file(file)) 140 + return &SHMEM_I(file_inode(file))->seals; 141 + 142 + #ifdef CONFIG_HUGETLBFS 143 + if (is_file_hugepages(file)) 144 + return &HUGETLBFS_I(file_inode(file))->seals; 145 + #endif 146 + 147 + return NULL; 148 + } 149 + 150 + #define F_ALL_SEALS (F_SEAL_SEAL | \ 151 + F_SEAL_SHRINK | \ 152 + F_SEAL_GROW | \ 153 + F_SEAL_WRITE) 154 + 155 + static int memfd_add_seals(struct file *file, unsigned int seals) 156 + { 157 + struct inode *inode = file_inode(file); 158 + unsigned int *file_seals; 159 + int error; 160 + 161 + /* 162 + * SEALING 163 + * Sealing allows multiple parties to share a tmpfs or hugetlbfs file 164 + * but restrict access to a specific subset of file operations. Seals 165 + * can only be added, but never removed. This way, mutually untrusted 166 + * parties can share common memory regions with a well-defined policy. 167 + * A malicious peer can thus never perform unwanted operations on a 168 + * shared object. 169 + * 170 + * Seals are only supported on special tmpfs or hugetlbfs files and 171 + * always affect the whole underlying inode. Once a seal is set, it 172 + * may prevent some kinds of access to the file. 
Currently, the 173 + * following seals are defined: 174 + * SEAL_SEAL: Prevent further seals from being set on this file 175 + * SEAL_SHRINK: Prevent the file from shrinking 176 + * SEAL_GROW: Prevent the file from growing 177 + * SEAL_WRITE: Prevent write access to the file 178 + * 179 + * As we don't require any trust relationship between two parties, we 180 + * must prevent seals from being removed. Therefore, sealing a file 181 + * only adds a given set of seals to the file, it never touches 182 + * existing seals. Furthermore, the "setting seals"-operation can be 183 + * sealed itself, which basically prevents any further seal from being 184 + * added. 185 + * 186 + * Semantics of sealing are only defined on volatile files. Only 187 + * anonymous tmpfs and hugetlbfs files support sealing. More 188 + * importantly, seals are never written to disk. Therefore, there's 189 + * no plan to support it on other file types. 190 + */ 191 + 192 + if (!(file->f_mode & FMODE_WRITE)) 193 + return -EPERM; 194 + if (seals & ~(unsigned int)F_ALL_SEALS) 195 + return -EINVAL; 196 + 197 + inode_lock(inode); 198 + 199 + file_seals = memfd_file_seals_ptr(file); 200 + if (!file_seals) { 201 + error = -EINVAL; 202 + goto unlock; 203 + } 204 + 205 + if (*file_seals & F_SEAL_SEAL) { 206 + error = -EPERM; 207 + goto unlock; 208 + } 209 + 210 + if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { 211 + error = mapping_deny_writable(file->f_mapping); 212 + if (error) 213 + goto unlock; 214 + 215 + error = memfd_wait_for_pins(file->f_mapping); 216 + if (error) { 217 + mapping_allow_writable(file->f_mapping); 218 + goto unlock; 219 + } 220 + } 221 + 222 + *file_seals |= seals; 223 + error = 0; 224 + 225 + unlock: 226 + inode_unlock(inode); 227 + return error; 228 + } 229 + 230 + static int memfd_get_seals(struct file *file) 231 + { 232 + unsigned int *seals = memfd_file_seals_ptr(file); 233 + 234 + return seals ? 
*seals : -EINVAL; 235 + } 236 + 237 + long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) 238 + { 239 + long error; 240 + 241 + switch (cmd) { 242 + case F_ADD_SEALS: 243 + /* disallow upper 32bit */ 244 + if (arg > UINT_MAX) 245 + return -EINVAL; 246 + 247 + error = memfd_add_seals(file, arg); 248 + break; 249 + case F_GET_SEALS: 250 + error = memfd_get_seals(file); 251 + break; 252 + default: 253 + error = -EINVAL; 254 + break; 255 + } 256 + 257 + return error; 258 + } 259 + 260 + #define MFD_NAME_PREFIX "memfd:" 261 + #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) 262 + #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) 263 + 264 + #define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) 265 + 266 + SYSCALL_DEFINE2(memfd_create, 267 + const char __user *, uname, 268 + unsigned int, flags) 269 + { 270 + unsigned int *file_seals; 271 + struct file *file; 272 + int fd, error; 273 + char *name; 274 + long len; 275 + 276 + if (!(flags & MFD_HUGETLB)) { 277 + if (flags & ~(unsigned int)MFD_ALL_FLAGS) 278 + return -EINVAL; 279 + } else { 280 + /* Allow huge page size encoding in flags. */ 281 + if (flags & ~(unsigned int)(MFD_ALL_FLAGS | 282 + (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) 283 + return -EINVAL; 284 + } 285 + 286 + /* length includes terminating zero */ 287 + len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); 288 + if (len <= 0) 289 + return -EFAULT; 290 + if (len > MFD_NAME_MAX_LEN + 1) 291 + return -EINVAL; 292 + 293 + name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); 294 + if (!name) 295 + return -ENOMEM; 296 + 297 + strcpy(name, MFD_NAME_PREFIX); 298 + if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { 299 + error = -EFAULT; 300 + goto err_name; 301 + } 302 + 303 + /* terminating-zero may have changed after strnlen_user() returned */ 304 + if (name[len + MFD_NAME_PREFIX_LEN - 1]) { 305 + error = -EFAULT; 306 + goto err_name; 307 + } 308 + 309 + fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? 
O_CLOEXEC : 0); 310 + if (fd < 0) { 311 + error = fd; 312 + goto err_name; 313 + } 314 + 315 + if (flags & MFD_HUGETLB) { 316 + struct user_struct *user = NULL; 317 + 318 + file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, 319 + HUGETLB_ANONHUGE_INODE, 320 + (flags >> MFD_HUGE_SHIFT) & 321 + MFD_HUGE_MASK); 322 + } else 323 + file = shmem_file_setup(name, 0, VM_NORESERVE); 324 + if (IS_ERR(file)) { 325 + error = PTR_ERR(file); 326 + goto err_fd; 327 + } 328 + file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; 329 + file->f_flags |= O_RDWR | O_LARGEFILE; 330 + 331 + if (flags & MFD_ALLOW_SEALING) { 332 + file_seals = memfd_file_seals_ptr(file); 333 + *file_seals &= ~F_SEAL_SEAL; 334 + } 335 + 336 + fd_install(fd, file); 337 + kfree(name); 338 + return fd; 339 + 340 + err_fd: 341 + put_unused_fd(fd); 342 + err_name: 343 + kfree(name); 344 + return error; 345 + }
-324
mm/shmem.c
··· 2618 2618 return offset; 2619 2619 } 2620 2620 2621 - /* 2622 - * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, 2623 - * so reuse a tag which we firmly believe is never set or cleared on tmpfs 2624 - * or hugetlbfs because they are memory only filesystems. 2625 - */ 2626 - #define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE 2627 - #define LAST_SCAN 4 /* about 150ms max */ 2628 - 2629 - static void memfd_tag_pins(struct address_space *mapping) 2630 - { 2631 - struct radix_tree_iter iter; 2632 - void __rcu **slot; 2633 - pgoff_t start; 2634 - struct page *page; 2635 - 2636 - lru_add_drain(); 2637 - start = 0; 2638 - rcu_read_lock(); 2639 - 2640 - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { 2641 - page = radix_tree_deref_slot(slot); 2642 - if (!page || radix_tree_exception(page)) { 2643 - if (radix_tree_deref_retry(page)) { 2644 - slot = radix_tree_iter_retry(&iter); 2645 - continue; 2646 - } 2647 - } else if (page_count(page) - page_mapcount(page) > 1) { 2648 - xa_lock_irq(&mapping->i_pages); 2649 - radix_tree_tag_set(&mapping->i_pages, iter.index, 2650 - MEMFD_TAG_PINNED); 2651 - xa_unlock_irq(&mapping->i_pages); 2652 - } 2653 - 2654 - if (need_resched()) { 2655 - slot = radix_tree_iter_resume(slot, &iter); 2656 - cond_resched_rcu(); 2657 - } 2658 - } 2659 - rcu_read_unlock(); 2660 - } 2661 - 2662 - /* 2663 - * Setting SEAL_WRITE requires us to verify there's no pending writer. However, 2664 - * via get_user_pages(), drivers might have some pending I/O without any active 2665 - * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages 2666 - * and see whether it has an elevated ref-count. If so, we tag them and wait for 2667 - * them to be dropped. 2668 - * The caller must guarantee that no new user will acquire writable references 2669 - * to those pages to avoid races. 
2670 - */ 2671 - static int memfd_wait_for_pins(struct address_space *mapping) 2672 - { 2673 - struct radix_tree_iter iter; 2674 - void __rcu **slot; 2675 - pgoff_t start; 2676 - struct page *page; 2677 - int error, scan; 2678 - 2679 - memfd_tag_pins(mapping); 2680 - 2681 - error = 0; 2682 - for (scan = 0; scan <= LAST_SCAN; scan++) { 2683 - if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED)) 2684 - break; 2685 - 2686 - if (!scan) 2687 - lru_add_drain_all(); 2688 - else if (schedule_timeout_killable((HZ << scan) / 200)) 2689 - scan = LAST_SCAN; 2690 - 2691 - start = 0; 2692 - rcu_read_lock(); 2693 - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 2694 - start, MEMFD_TAG_PINNED) { 2695 - 2696 - page = radix_tree_deref_slot(slot); 2697 - if (radix_tree_exception(page)) { 2698 - if (radix_tree_deref_retry(page)) { 2699 - slot = radix_tree_iter_retry(&iter); 2700 - continue; 2701 - } 2702 - 2703 - page = NULL; 2704 - } 2705 - 2706 - if (page && 2707 - page_count(page) - page_mapcount(page) != 1) { 2708 - if (scan < LAST_SCAN) 2709 - goto continue_resched; 2710 - 2711 - /* 2712 - * On the last scan, we clean up all those tags 2713 - * we inserted; but make a note that we still 2714 - * found pages pinned. 
2715 - */ 2716 - error = -EBUSY; 2717 - } 2718 - 2719 - xa_lock_irq(&mapping->i_pages); 2720 - radix_tree_tag_clear(&mapping->i_pages, 2721 - iter.index, MEMFD_TAG_PINNED); 2722 - xa_unlock_irq(&mapping->i_pages); 2723 - continue_resched: 2724 - if (need_resched()) { 2725 - slot = radix_tree_iter_resume(slot, &iter); 2726 - cond_resched_rcu(); 2727 - } 2728 - } 2729 - rcu_read_unlock(); 2730 - } 2731 - 2732 - return error; 2733 - } 2734 - 2735 - static unsigned int *memfd_file_seals_ptr(struct file *file) 2736 - { 2737 - if (shmem_file(file)) 2738 - return &SHMEM_I(file_inode(file))->seals; 2739 - 2740 - #ifdef CONFIG_HUGETLBFS 2741 - if (is_file_hugepages(file)) 2742 - return &HUGETLBFS_I(file_inode(file))->seals; 2743 - #endif 2744 - 2745 - return NULL; 2746 - } 2747 - 2748 - #define F_ALL_SEALS (F_SEAL_SEAL | \ 2749 - F_SEAL_SHRINK | \ 2750 - F_SEAL_GROW | \ 2751 - F_SEAL_WRITE) 2752 - 2753 - static int memfd_add_seals(struct file *file, unsigned int seals) 2754 - { 2755 - struct inode *inode = file_inode(file); 2756 - unsigned int *file_seals; 2757 - int error; 2758 - 2759 - /* 2760 - * SEALING 2761 - * Sealing allows multiple parties to share a tmpfs or hugetlbfs file 2762 - * but restrict access to a specific subset of file operations. Seals 2763 - * can only be added, but never removed. This way, mutually untrusted 2764 - * parties can share common memory regions with a well-defined policy. 2765 - * A malicious peer can thus never perform unwanted operations on a 2766 - * shared object. 2767 - * 2768 - * Seals are only supported on special tmpfs or hugetlbfs files and 2769 - * always affect the whole underlying inode. Once a seal is set, it 2770 - * may prevent some kinds of access to the file. 
Currently, the 2771 - * following seals are defined: 2772 - * SEAL_SEAL: Prevent further seals from being set on this file 2773 - * SEAL_SHRINK: Prevent the file from shrinking 2774 - * SEAL_GROW: Prevent the file from growing 2775 - * SEAL_WRITE: Prevent write access to the file 2776 - * 2777 - * As we don't require any trust relationship between two parties, we 2778 - * must prevent seals from being removed. Therefore, sealing a file 2779 - * only adds a given set of seals to the file, it never touches 2780 - * existing seals. Furthermore, the "setting seals"-operation can be 2781 - * sealed itself, which basically prevents any further seal from being 2782 - * added. 2783 - * 2784 - * Semantics of sealing are only defined on volatile files. Only 2785 - * anonymous tmpfs and hugetlbfs files support sealing. More 2786 - * importantly, seals are never written to disk. Therefore, there's 2787 - * no plan to support it on other file types. 2788 - */ 2789 - 2790 - if (!(file->f_mode & FMODE_WRITE)) 2791 - return -EPERM; 2792 - if (seals & ~(unsigned int)F_ALL_SEALS) 2793 - return -EINVAL; 2794 - 2795 - inode_lock(inode); 2796 - 2797 - file_seals = memfd_file_seals_ptr(file); 2798 - if (!file_seals) { 2799 - error = -EINVAL; 2800 - goto unlock; 2801 - } 2802 - 2803 - if (*file_seals & F_SEAL_SEAL) { 2804 - error = -EPERM; 2805 - goto unlock; 2806 - } 2807 - 2808 - if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { 2809 - error = mapping_deny_writable(file->f_mapping); 2810 - if (error) 2811 - goto unlock; 2812 - 2813 - error = memfd_wait_for_pins(file->f_mapping); 2814 - if (error) { 2815 - mapping_allow_writable(file->f_mapping); 2816 - goto unlock; 2817 - } 2818 - } 2819 - 2820 - *file_seals |= seals; 2821 - error = 0; 2822 - 2823 - unlock: 2824 - inode_unlock(inode); 2825 - return error; 2826 - } 2827 - 2828 - static int memfd_get_seals(struct file *file) 2829 - { 2830 - unsigned int *seals = memfd_file_seals_ptr(file); 2831 - 2832 - return seals ? 
*seals : -EINVAL; 2833 - } 2834 - 2835 - long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) 2836 - { 2837 - long error; 2838 - 2839 - switch (cmd) { 2840 - case F_ADD_SEALS: 2841 - /* disallow upper 32bit */ 2842 - if (arg > UINT_MAX) 2843 - return -EINVAL; 2844 - 2845 - error = memfd_add_seals(file, arg); 2846 - break; 2847 - case F_GET_SEALS: 2848 - error = memfd_get_seals(file); 2849 - break; 2850 - default: 2851 - error = -EINVAL; 2852 - break; 2853 - } 2854 - 2855 - return error; 2856 - } 2857 - 2858 2621 static long shmem_fallocate(struct file *file, int mode, loff_t offset, 2859 2622 loff_t len) 2860 2623 { ··· 3438 3675 #endif 3439 3676 shmem_show_mpol(seq, sbinfo->mpol); 3440 3677 return 0; 3441 - } 3442 - 3443 - #define MFD_NAME_PREFIX "memfd:" 3444 - #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) 3445 - #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) 3446 - 3447 - #define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) 3448 - 3449 - SYSCALL_DEFINE2(memfd_create, 3450 - const char __user *, uname, 3451 - unsigned int, flags) 3452 - { 3453 - unsigned int *file_seals; 3454 - struct file *file; 3455 - int fd, error; 3456 - char *name; 3457 - long len; 3458 - 3459 - if (!(flags & MFD_HUGETLB)) { 3460 - if (flags & ~(unsigned int)MFD_ALL_FLAGS) 3461 - return -EINVAL; 3462 - } else { 3463 - /* Allow huge page size encoding in flags. 
*/ 3464 - if (flags & ~(unsigned int)(MFD_ALL_FLAGS | 3465 - (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) 3466 - return -EINVAL; 3467 - } 3468 - 3469 - /* length includes terminating zero */ 3470 - len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); 3471 - if (len <= 0) 3472 - return -EFAULT; 3473 - if (len > MFD_NAME_MAX_LEN + 1) 3474 - return -EINVAL; 3475 - 3476 - name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); 3477 - if (!name) 3478 - return -ENOMEM; 3479 - 3480 - strcpy(name, MFD_NAME_PREFIX); 3481 - if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { 3482 - error = -EFAULT; 3483 - goto err_name; 3484 - } 3485 - 3486 - /* terminating-zero may have changed after strnlen_user() returned */ 3487 - if (name[len + MFD_NAME_PREFIX_LEN - 1]) { 3488 - error = -EFAULT; 3489 - goto err_name; 3490 - } 3491 - 3492 - fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); 3493 - if (fd < 0) { 3494 - error = fd; 3495 - goto err_name; 3496 - } 3497 - 3498 - if (flags & MFD_HUGETLB) { 3499 - struct user_struct *user = NULL; 3500 - 3501 - file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, 3502 - HUGETLB_ANONHUGE_INODE, 3503 - (flags >> MFD_HUGE_SHIFT) & 3504 - MFD_HUGE_MASK); 3505 - } else 3506 - file = shmem_file_setup(name, 0, VM_NORESERVE); 3507 - if (IS_ERR(file)) { 3508 - error = PTR_ERR(file); 3509 - goto err_fd; 3510 - } 3511 - file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; 3512 - file->f_flags |= O_RDWR | O_LARGEFILE; 3513 - 3514 - if (flags & MFD_ALLOW_SEALING) { 3515 - file_seals = memfd_file_seals_ptr(file); 3516 - *file_seals &= ~F_SEAL_SEAL; 3517 - } 3518 - 3519 - fd_install(fd, file); 3520 - kfree(name); 3521 - return fd; 3522 - 3523 - err_fd: 3524 - put_unused_fd(fd); 3525 - err_name: 3526 - kfree(name); 3527 - return error; 3528 3678 } 3529 3679 3530 3680 #endif /* CONFIG_TMPFS */