Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

fs: add immutable rootfs

Currently pivot_root() doesn't work on the real rootfs because it
cannot be unmounted. Userspace has to do a recursive removal of the
initramfs contents manually before continuing the boot.

Really all we want from the real rootfs is to serve as the parent mount
for anything that is actually useful such as the tmpfs or ramfs for
initramfs unpacking or the rootfs itself. There's no need for the real
rootfs to actually be anything meaningful or useful. Add an immutable
rootfs called "nullfs" that can be selected via the "nullfs_rootfs"
kernel command line option.

The kernel will mount a tmpfs/ramfs on top of it, unpack the initramfs
and fire up userspace which mounts the rootfs and can then just do:

chdir(rootfs);
pivot_root(".", ".");
umount2(".", MNT_DETACH);

and be done with it. (Of course, userspace can also choose to retain the
initramfs contents by using something like pivot_root(".", "/initramfs")
without unmounting it.)

Technically this also means that the rootfs mount in unprivileged
namespaces doesn't need to become MNT_LOCKED anymore as it's guaranteed
that the immutable rootfs remains permanently empty so there cannot be
anything revealed by unmounting the covering mount.

In the future this will also allow us to create completely empty mount
namespaces without the risk of leaking anything.

systemd already handles this all correctly as it tries to pivot_root()
first and falls back to MS_MOVE only when that fails.

This goes back to various discussions in previous years and an LPC 2024
presentation about this very topic.

Link: https://patch.msgid.link/20260112-work-immutable-rootfs-v2-3-88dd1c34a204@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>

+159 -12
+1 -1
fs/Makefile
··· 16 16 stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ 17 17 fs_dirent.o fs_context.o fs_parser.o fsopen.o init.o \ 18 18 kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \ 19 - file_attr.o 19 + file_attr.o nullfs.o 20 20 21 21 obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o 22 22 obj-$(CONFIG_PROC_FS) += proc_namespace.o
+1
fs/mount.h
··· 5 5 #include <linux/ns_common.h> 6 6 #include <linux/fs_pin.h> 7 7 8 + extern struct file_system_type nullfs_fs_type; 8 9 extern struct list_head notify_list; 9 10 10 11 struct mnt_namespace {
+71 -11
fs/namespace.c
··· 75 75 76 76 __setup("initramfs_options=", initramfs_options_setup); 77 77 78 + bool nullfs_rootfs = false; 79 + 80 + static int __init nullfs_rootfs_setup(char *str) 81 + { 82 + if (*str) 83 + return 0; 84 + nullfs_rootfs = true; 85 + return 1; 86 + } 87 + __setup("nullfs_rootfs", nullfs_rootfs_setup); 88 + 78 89 static u64 event; 79 90 static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC); 80 91 static DEFINE_IDA(mnt_group_ida); ··· 4593 4582 * pointed to by put_old must yield the same directory as new_root. No other 4594 4583 * file system may be mounted on put_old. After all, new_root is a mountpoint. 4595 4584 * 4596 - * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. 4597 - * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives 4585 + * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem 4586 + * unless the kernel was booted with "nullfs_rootfs". See 4587 + * Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives 4598 4588 * in this situation. 4599 4589 * 4600 4590 * Notes: ··· 5988 5976 5989 5977 static void __init init_mount_tree(void) 5990 5978 { 5991 - struct vfsmount *mnt; 5992 - struct mount *m; 5979 + struct vfsmount *mnt, *nullfs_mnt; 5980 + struct mount *mnt_root; 5993 5981 struct path root; 5982 + 5983 + /* 5984 + * When nullfs is used, we create two mounts: 5985 + * 5986 + * (1) nullfs with mount id 1 5987 + * (2) mutable rootfs with mount id 2 5988 + * 5989 + * with (2) mounted on top of (1). 
5990 + */ 5991 + if (nullfs_rootfs) { 5992 + nullfs_mnt = vfs_kern_mount(&nullfs_fs_type, 0, "nullfs", NULL); 5993 + if (IS_ERR(nullfs_mnt)) 5994 + panic("VFS: Failed to create nullfs"); 5995 + } 5994 5996 5995 5997 mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options); 5996 5998 if (IS_ERR(mnt)) 5997 5999 panic("Can't create rootfs"); 5998 6000 5999 - m = real_mount(mnt); 6000 - init_mnt_ns.root = m; 6001 - init_mnt_ns.nr_mounts = 1; 6002 - mnt_add_to_ns(&init_mnt_ns, m); 6001 + if (nullfs_rootfs) { 6002 + VFS_WARN_ON_ONCE(real_mount(nullfs_mnt)->mnt_id != 1); 6003 + VFS_WARN_ON_ONCE(real_mount(mnt)->mnt_id != 2); 6004 + 6005 + /* The namespace root is the nullfs mnt. */ 6006 + mnt_root = real_mount(nullfs_mnt); 6007 + init_mnt_ns.root = mnt_root; 6008 + 6009 + /* Mount mutable rootfs on top of nullfs. */ 6010 + root.mnt = nullfs_mnt; 6011 + root.dentry = nullfs_mnt->mnt_root; 6012 + 6013 + LOCK_MOUNT_EXACT(mp, &root); 6014 + if (unlikely(IS_ERR(mp.parent))) 6015 + panic("VFS: Failed to mount rootfs on nullfs"); 6016 + scoped_guard(mount_writer) 6017 + attach_mnt(real_mount(mnt), mp.parent, mp.mp); 6018 + 6019 + pr_info("VFS: Finished mounting rootfs on nullfs\n"); 6020 + } else { 6021 + VFS_WARN_ON_ONCE(real_mount(mnt)->mnt_id != 1); 6022 + 6023 + /* The namespace root is the mutable rootfs. */ 6024 + mnt_root = real_mount(mnt); 6025 + init_mnt_ns.root = mnt_root; 6026 + } 6027 + 6028 + /* 6029 + * We've dropped all locks here but that's fine. Not just are we 6030 + * the only task that's running, there's no other mount 6031 + * namespace in existence and the initial mount namespace is 6032 + * completely empty until we add the mounts we just created. 
6033 + */ 6034 + for (struct mount *p = mnt_root; p; p = next_mnt(p, mnt_root)) { 6035 + mnt_add_to_ns(&init_mnt_ns, p); 6036 + init_mnt_ns.nr_mounts++; 6037 + } 6038 + 6003 6039 init_task.nsproxy->mnt_ns = &init_mnt_ns; 6004 6040 get_mnt_ns(&init_mnt_ns); 6005 6041 6006 - root.mnt = mnt; 6007 - root.dentry = mnt->mnt_root; 6008 - 6042 + /* The root and pwd always point to the mutable rootfs. */ 6043 + root.mnt = mnt; 6044 + root.dentry = mnt->mnt_root; 6009 6045 set_fs_pwd(current->fs, &root); 6010 6046 set_fs_root(current->fs, &root); 6011 6047
+70
fs/nullfs.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */ 3 + #include <linux/fs/super_types.h> 4 + #include <linux/fs_context.h> 5 + #include <linux/magic.h> 6 + 7 + static const struct super_operations nullfs_super_operations = { 8 + .statfs = simple_statfs, 9 + }; 10 + 11 + static int nullfs_fs_fill_super(struct super_block *s, struct fs_context *fc) 12 + { 13 + struct inode *inode; 14 + 15 + s->s_maxbytes = MAX_LFS_FILESIZE; 16 + s->s_blocksize = PAGE_SIZE; 17 + s->s_blocksize_bits = PAGE_SHIFT; 18 + s->s_magic = NULL_FS_MAGIC; 19 + s->s_op = &nullfs_super_operations; 20 + s->s_export_op = NULL; 21 + s->s_xattr = NULL; 22 + s->s_time_gran = 1; 23 + s->s_d_flags = 0; 24 + 25 + inode = new_inode(s); 26 + if (!inode) 27 + return -ENOMEM; 28 + 29 + /* nullfs is permanently empty... */ 30 + make_empty_dir_inode(inode); 31 + simple_inode_init_ts(inode); 32 + inode->i_ino = 1; 33 + /* ... and immutable. */ 34 + inode->i_flags |= S_IMMUTABLE; 35 + 36 + s->s_root = d_make_root(inode); 37 + if (!s->s_root) 38 + return -ENOMEM; 39 + 40 + return 0; 41 + } 42 + 43 + /* 44 + * For now this is a single global instance. If needed we can make it 45 + * mountable by userspace at which point we will need to make it 46 + * multi-instance. 47 + */ 48 + static int nullfs_fs_get_tree(struct fs_context *fc) 49 + { 50 + return get_tree_single(fc, nullfs_fs_fill_super); 51 + } 52 + 53 + static const struct fs_context_operations nullfs_fs_context_ops = { 54 + .get_tree = nullfs_fs_get_tree, 55 + }; 56 + 57 + static int nullfs_init_fs_context(struct fs_context *fc) 58 + { 59 + fc->ops = &nullfs_fs_context_ops; 60 + fc->global = true; 61 + fc->sb_flags = SB_NOUSER; 62 + fc->s_iflags = SB_I_NOEXEC | SB_I_NODEV; 63 + return 0; 64 + } 65 + 66 + struct file_system_type nullfs_fs_type = { 67 + .name = "nullfs", 68 + .init_fs_context = nullfs_init_fs_context, 69 + .kill_sb = kill_anon_super, 70 + };
+1
include/uapi/linux/magic.h
··· 104 104 #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ 105 105 #define PID_FS_MAGIC 0x50494446 /* "PIDF" */ 106 106 #define GUEST_MEMFD_MAGIC 0x474d454d /* "GMEM" */ 107 + #define NULL_FS_MAGIC 0x4E554C4C /* "NULL" */ 107 108 108 109 #endif /* __LINUX_MAGIC_H__ */
+14
init/do_mounts.c
··· 492 492 mount_root(saved_root_name); 493 493 out: 494 494 devtmpfs_mount(); 495 + 496 + if (nullfs_rootfs) { 497 + if (init_pivot_root(".", ".")) { 498 + pr_err("VFS: Failed to pivot into new rootfs\n"); 499 + return; 500 + } 501 + if (init_umount(".", MNT_DETACH)) { 502 + pr_err("VFS: Failed to unmount old rootfs\n"); 503 + return; 504 + } 505 + pr_info("VFS: Pivoted into new rootfs\n"); 506 + return; 507 + } 508 + 495 509 init_mount(".", "/", NULL, MS_MOVE, NULL); 496 510 init_chroot("."); 497 511 }
+1
init/do_mounts.h
··· 15 15 void mount_root_generic(char *name, char *pretty_name, int flags); 16 16 void mount_root(char *root_device_name); 17 17 extern int root_mountflags; 18 + extern bool nullfs_rootfs; 18 19 19 20 static inline __init int create_dev(char *name, dev_t dev) 20 21 {