Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

shm: add memfd_create() syscall

memfd_create() is similar to mmap(MAP_ANON), but returns a file-descriptor
that you can pass to mmap(). It can support sealing and avoids any
connection to user-visible mount-points. Thus, it's not subject to quotas
on mounted file-systems and can be used like malloc()'ed memory, except
that it comes with a file-descriptor referring to it.

memfd_create() returns the raw shmem file, so calls like ftruncate() can
be used to modify the underlying inode. Calls like fstat() also return
proper information and report the file as a regular file. If you want
sealing, you can specify MFD_ALLOW_SEALING. Otherwise, sealing is not
supported (as on all other regular files).

Compared to O_TMPFILE, it does not require a tmpfs mount-point and is not
subject to a filesystem size limit. It is still properly accounted against
memcg limits, though, and is subject to the same overcommit or
no-overcommit accounting as all user memory.

Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Ryan Lortie <desrt@desrt.ca>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Daniel Mack <zonque@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

David Herrmann and committed by
Linus Torvalds
9183df25 40e041a2

+85
+1
arch/x86/syscalls/syscall_32.tbl
··· 362 362 353 i386 renameat2 sys_renameat2 363 363 354 i386 seccomp sys_seccomp 364 364 355 i386 getrandom sys_getrandom 365 + 356 i386 memfd_create sys_memfd_create
+1
arch/x86/syscalls/syscall_64.tbl
··· 325 325 316 common renameat2 sys_renameat2 326 326 317 common seccomp sys_seccomp 327 327 318 common getrandom sys_getrandom 328 + 319 common memfd_create sys_memfd_create 328 329 329 330 # 330 331 # x32-specific system call numbers start at 512 to avoid cache impact
+1
include/linux/syscalls.h
··· 802 802 asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr); 803 803 asmlinkage long sys_eventfd(unsigned int count); 804 804 asmlinkage long sys_eventfd2(unsigned int count, int flags); 805 + asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags); 805 806 asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); 806 807 asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int); 807 808 asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
+8
include/uapi/linux/memfd.h
··· 1 + #ifndef _UAPI_LINUX_MEMFD_H 2 + #define _UAPI_LINUX_MEMFD_H 3 + 4 + /* flags for memfd_create(2) (unsigned int) */ 5 + #define MFD_CLOEXEC 0x0001U 6 + #define MFD_ALLOW_SEALING 0x0002U 7 + 8 + #endif /* _UAPI_LINUX_MEMFD_H */
+1
kernel/sys_ni.c
··· 197 197 cond_syscall(compat_sys_timerfd_gettime); 198 198 cond_syscall(sys_eventfd); 199 199 cond_syscall(sys_eventfd2); 200 + cond_syscall(sys_memfd_create); 200 201 201 202 /* performance counters: */ 202 203 cond_syscall(sys_perf_event_open);
+73
mm/shmem.c
··· 66 66 #include <linux/highmem.h> 67 67 #include <linux/seq_file.h> 68 68 #include <linux/magic.h> 69 + #include <linux/syscalls.h> 69 70 #include <linux/fcntl.h> 71 + #include <uapi/linux/memfd.h> 70 72 71 73 #include <asm/uaccess.h> 72 74 #include <asm/pgtable.h> ··· 2734 2732 shmem_show_mpol(seq, sbinfo->mpol); 2735 2733 return 0; 2736 2734 } 2735 + 2736 + #define MFD_NAME_PREFIX "memfd:" 2737 + #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) 2738 + #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) 2739 + 2740 + #define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING) 2741 + 2742 + SYSCALL_DEFINE2(memfd_create, 2743 + const char __user *, uname, 2744 + unsigned int, flags) 2745 + { 2746 + struct shmem_inode_info *info; 2747 + struct file *file; 2748 + int fd, error; 2749 + char *name; 2750 + long len; 2751 + 2752 + if (flags & ~(unsigned int)MFD_ALL_FLAGS) 2753 + return -EINVAL; 2754 + 2755 + /* length includes terminating zero */ 2756 + len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); 2757 + if (len <= 0) 2758 + return -EFAULT; 2759 + if (len > MFD_NAME_MAX_LEN + 1) 2760 + return -EINVAL; 2761 + 2762 + name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_TEMPORARY); 2763 + if (!name) 2764 + return -ENOMEM; 2765 + 2766 + strcpy(name, MFD_NAME_PREFIX); 2767 + if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { 2768 + error = -EFAULT; 2769 + goto err_name; 2770 + } 2771 + 2772 + /* terminating-zero may have changed after strnlen_user() returned */ 2773 + if (name[len + MFD_NAME_PREFIX_LEN - 1]) { 2774 + error = -EFAULT; 2775 + goto err_name; 2776 + } 2777 + 2778 + fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? 
O_CLOEXEC : 0); 2779 + if (fd < 0) { 2780 + error = fd; 2781 + goto err_name; 2782 + } 2783 + 2784 + file = shmem_file_setup(name, 0, VM_NORESERVE); 2785 + if (IS_ERR(file)) { 2786 + error = PTR_ERR(file); 2787 + goto err_fd; 2788 + } 2789 + info = SHMEM_I(file_inode(file)); 2790 + file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; 2791 + file->f_flags |= O_RDWR | O_LARGEFILE; 2792 + if (flags & MFD_ALLOW_SEALING) 2793 + info->seals &= ~F_SEAL_SEAL; 2794 + 2795 + fd_install(fd, file); 2796 + kfree(name); 2797 + return fd; 2798 + 2799 + err_fd: 2800 + put_unused_fd(fd); 2801 + err_name: 2802 + kfree(name); 2803 + return error; 2804 + } 2805 + 2737 2806 #endif /* CONFIG_TMPFS */ 2738 2807 2739 2808 static void shmem_put_super(struct super_block *sb)