Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mounts: keep list of mounts in an rbtree

When adding a mount to a namespace insert it into an rbtree rooted in the
mnt_namespace instead of a linear list.

The mnt.mnt_list is still used to set up the mount tree and for
propagation, but not after the mount has been added to a namespace. Hence
mnt_list can live in union with rb_node. Use MNT_ONRB mount flag to
validate that the mount is on the correct list.

This allows removing the cursor used for reading /proc/$PID/mountinfo. The
mnt_id_unique of the next mount can be used as an index into the seq file.

Tested by inserting 100k bind mounts, unsharing the mount namespace, and
unmounting. No performance regressions have been observed.

For the last mount in the 100k list the statmount() call was more than 100x
faster due to the mount ID lookup not having to do a linear search. This
patch makes the overhead of mount ID lookup non-observable in this range.

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Link: https://lore.kernel.org/r/20231025140205.3586473-3-mszeredi@redhat.com
Reviewed-by: Ian Kent <raven@themaw.net>
Signed-off-by: Christian Brauner <brauner@kernel.org>

Authored by Miklos Szeredi and committed by Christian Brauner
2eea9ce4 98d2b430

+106 -118
+14 -10
fs/mount.h
··· 8 8 struct mnt_namespace { 9 9 struct ns_common ns; 10 10 struct mount * root; 11 - /* 12 - * Traversal and modification of .list is protected by either 13 - * - taking namespace_sem for write, OR 14 - * - taking namespace_sem for read AND taking .ns_lock. 15 - */ 16 - struct list_head list; 17 - spinlock_t ns_lock; 11 + struct rb_root mounts; /* Protected by namespace_sem */ 18 12 struct user_namespace *user_ns; 19 13 struct ucounts *ucounts; 20 14 u64 seq; /* Sequence number to prevent loops */ 21 15 wait_queue_head_t poll; 22 16 u64 event; 23 - unsigned int mounts; /* # of mounts in the namespace */ 17 + unsigned int nr_mounts; /* # of mounts in the namespace */ 24 18 unsigned int pending_mounts; 25 19 } __randomize_layout; 26 20 ··· 49 55 struct list_head mnt_child; /* and going through their mnt_child */ 50 56 struct list_head mnt_instance; /* mount instance on sb->s_mounts */ 51 57 const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ 52 - struct list_head mnt_list; 58 + union { 59 + struct rb_node mnt_node; /* Under ns->mounts */ 60 + struct list_head mnt_list; 61 + }; 53 62 struct list_head mnt_expire; /* link in fs-specific expiry list */ 54 63 struct list_head mnt_share; /* circular list of shared mounts */ 55 64 struct list_head mnt_slave_list;/* list of slave mounts */ ··· 125 128 struct mnt_namespace *ns; 126 129 struct path root; 127 130 int (*show)(struct seq_file *, struct vfsmount *); 128 - struct mount cursor; 129 131 }; 130 132 131 133 extern const struct seq_operations mounts_op; ··· 141 145 static inline bool is_anon_ns(struct mnt_namespace *ns) 142 146 { 143 147 return ns->seq == 0; 148 + } 149 + 150 + static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) 151 + { 152 + WARN_ON(!(mnt->mnt.mnt_flags & MNT_ONRB)); 153 + mnt->mnt.mnt_flags &= ~MNT_ONRB; 154 + rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts); 155 + list_add_tail(&mnt->mnt_list, dt_list); 144 156 } 145 157 146 158 extern void 
mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
+89 -101
fs/namespace.c
··· 734 734 return m; 735 735 } 736 736 737 - static inline void lock_ns_list(struct mnt_namespace *ns) 738 - { 739 - spin_lock(&ns->ns_lock); 740 - } 741 - 742 - static inline void unlock_ns_list(struct mnt_namespace *ns) 743 - { 744 - spin_unlock(&ns->ns_lock); 745 - } 746 - 747 - static inline bool mnt_is_cursor(struct mount *mnt) 748 - { 749 - return mnt->mnt.mnt_flags & MNT_CURSOR; 750 - } 751 - 752 737 /* 753 738 * __is_local_mountpoint - Test to see if dentry is a mountpoint in the 754 739 * current mount namespace. ··· 752 767 bool __is_local_mountpoint(struct dentry *dentry) 753 768 { 754 769 struct mnt_namespace *ns = current->nsproxy->mnt_ns; 755 - struct mount *mnt; 770 + struct mount *mnt, *n; 756 771 bool is_covered = false; 757 772 758 773 down_read(&namespace_sem); 759 - lock_ns_list(ns); 760 - list_for_each_entry(mnt, &ns->list, mnt_list) { 761 - if (mnt_is_cursor(mnt)) 762 - continue; 774 + rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) { 763 775 is_covered = (mnt->mnt_mountpoint == dentry); 764 776 if (is_covered) 765 777 break; 766 778 } 767 - unlock_ns_list(ns); 768 779 up_read(&namespace_sem); 769 780 770 781 return is_covered; ··· 1007 1026 mnt_add_count(old_parent, -1); 1008 1027 } 1009 1028 1029 + static inline struct mount *node_to_mount(struct rb_node *node) 1030 + { 1031 + return rb_entry(node, struct mount, mnt_node); 1032 + } 1033 + 1034 + static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) 1035 + { 1036 + struct rb_node **link = &ns->mounts.rb_node; 1037 + struct rb_node *parent = NULL; 1038 + 1039 + WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB); 1040 + mnt->mnt_ns = ns; 1041 + while (*link) { 1042 + parent = *link; 1043 + if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) 1044 + link = &parent->rb_left; 1045 + else 1046 + link = &parent->rb_right; 1047 + } 1048 + rb_link_node(&mnt->mnt_node, parent, link); 1049 + rb_insert_color(&mnt->mnt_node, &ns->mounts); 1050 + mnt->mnt.mnt_flags |= 
MNT_ONRB; 1051 + } 1052 + 1010 1053 /* 1011 1054 * vfsmount lock must be held for write 1012 1055 */ ··· 1044 1039 BUG_ON(parent == mnt); 1045 1040 1046 1041 list_add_tail(&head, &mnt->mnt_list); 1047 - list_for_each_entry(m, &head, mnt_list) 1048 - m->mnt_ns = n; 1042 + while (!list_empty(&head)) { 1043 + m = list_first_entry(&head, typeof(*m), mnt_list); 1044 + list_del(&m->mnt_list); 1049 1045 1050 - list_splice(&head, n->list.prev); 1051 - 1052 - n->mounts += n->pending_mounts; 1046 + mnt_add_to_ns(n, m); 1047 + } 1048 + n->nr_mounts += n->pending_mounts; 1053 1049 n->pending_mounts = 0; 1054 1050 1055 1051 __attach_mnt(mnt, parent); ··· 1198 1192 } 1199 1193 1200 1194 mnt->mnt.mnt_flags = old->mnt.mnt_flags; 1201 - mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); 1195 + mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB); 1202 1196 1203 1197 atomic_inc(&sb->s_active); 1204 1198 mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); ··· 1423 1417 return &p->mnt; 1424 1418 } 1425 1419 1426 - #ifdef CONFIG_PROC_FS 1427 - static struct mount *mnt_list_next(struct mnt_namespace *ns, 1428 - struct list_head *p) 1420 + /* 1421 + * Returns the mount which either has the specified mnt_id, or has the next 1422 + * smallest id after the specified one. 
1423 + */ 1424 + static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id) 1429 1425 { 1430 - struct mount *mnt, *ret = NULL; 1426 + struct rb_node *node = ns->mounts.rb_node; 1427 + struct mount *ret = NULL; 1431 1428 1432 - lock_ns_list(ns); 1433 - list_for_each_continue(p, &ns->list) { 1434 - mnt = list_entry(p, typeof(*mnt), mnt_list); 1435 - if (!mnt_is_cursor(mnt)) { 1436 - ret = mnt; 1437 - break; 1429 + while (node) { 1430 + struct mount *m = node_to_mount(node); 1431 + 1432 + if (mnt_id <= m->mnt_id_unique) { 1433 + ret = node_to_mount(node); 1434 + if (mnt_id == m->mnt_id_unique) 1435 + break; 1436 + node = node->rb_left; 1437 + } else { 1438 + node = node->rb_right; 1438 1439 } 1439 1440 } 1440 - unlock_ns_list(ns); 1441 - 1442 1441 return ret; 1443 1442 } 1443 + 1444 + #ifdef CONFIG_PROC_FS 1444 1445 1445 1446 /* iterator; we want it to have access to namespace_sem, thus here... */ 1446 1447 static void *m_start(struct seq_file *m, loff_t *pos) 1447 1448 { 1448 1449 struct proc_mounts *p = m->private; 1449 - struct list_head *prev; 1450 1450 1451 1451 down_read(&namespace_sem); 1452 - if (!*pos) { 1453 - prev = &p->ns->list; 1454 - } else { 1455 - prev = &p->cursor.mnt_list; 1456 1452 1457 - /* Read after we'd reached the end? 
*/ 1458 - if (list_empty(prev)) 1459 - return NULL; 1460 - } 1461 - 1462 - return mnt_list_next(p->ns, prev); 1453 + return mnt_find_id_at(p->ns, *pos); 1463 1454 } 1464 1455 1465 1456 static void *m_next(struct seq_file *m, void *v, loff_t *pos) 1466 1457 { 1467 - struct proc_mounts *p = m->private; 1468 - struct mount *mnt = v; 1458 + struct mount *next = NULL, *mnt = v; 1459 + struct rb_node *node = rb_next(&mnt->mnt_node); 1469 1460 1470 1461 ++*pos; 1471 - return mnt_list_next(p->ns, &mnt->mnt_list); 1462 + if (node) { 1463 + next = node_to_mount(node); 1464 + *pos = next->mnt_id_unique; 1465 + } 1466 + return next; 1472 1467 } 1473 1468 1474 1469 static void m_stop(struct seq_file *m, void *v) 1475 1470 { 1476 - struct proc_mounts *p = m->private; 1477 - struct mount *mnt = v; 1478 - 1479 - lock_ns_list(p->ns); 1480 - if (mnt) 1481 - list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list); 1482 - else 1483 - list_del_init(&p->cursor.mnt_list); 1484 - unlock_ns_list(p->ns); 1485 1471 up_read(&namespace_sem); 1486 1472 } 1487 1473 ··· 1491 1493 .show = m_show, 1492 1494 }; 1493 1495 1494 - void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor) 1495 - { 1496 - down_read(&namespace_sem); 1497 - lock_ns_list(ns); 1498 - list_del(&cursor->mnt_list); 1499 - unlock_ns_list(ns); 1500 - up_read(&namespace_sem); 1501 - } 1502 1496 #endif /* CONFIG_PROC_FS */ 1503 1497 1504 1498 /** ··· 1632 1642 /* Gather the mounts to umount */ 1633 1643 for (p = mnt; p; p = next_mnt(p, mnt)) { 1634 1644 p->mnt.mnt_flags |= MNT_UMOUNT; 1635 - list_move(&p->mnt_list, &tmp_list); 1645 + if (p->mnt.mnt_flags & MNT_ONRB) 1646 + move_from_ns(p, &tmp_list); 1647 + else 1648 + list_move(&p->mnt_list, &tmp_list); 1636 1649 } 1637 1650 1638 1651 /* Hide the mounts from mnt_mounts */ ··· 1655 1662 list_del_init(&p->mnt_list); 1656 1663 ns = p->mnt_ns; 1657 1664 if (ns) { 1658 - ns->mounts--; 1665 + ns->nr_mounts--; 1659 1666 __touch_mnt_namespace(ns); 1660 1667 } 1661 1668 p->mnt_ns = 
NULL; ··· 1781 1788 1782 1789 event++; 1783 1790 if (flags & MNT_DETACH) { 1784 - if (!list_empty(&mnt->mnt_list)) 1791 + if (mnt->mnt.mnt_flags & MNT_ONRB || 1792 + !list_empty(&mnt->mnt_list)) 1785 1793 umount_tree(mnt, UMOUNT_PROPAGATE); 1786 1794 retval = 0; 1787 1795 } else { 1788 1796 shrink_submounts(mnt); 1789 1797 retval = -EBUSY; 1790 1798 if (!propagate_mount_busy(mnt, 2)) { 1791 - if (!list_empty(&mnt->mnt_list)) 1799 + if (mnt->mnt.mnt_flags & MNT_ONRB || 1800 + !list_empty(&mnt->mnt_list)) 1792 1801 umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); 1793 1802 retval = 0; 1794 1803 } ··· 2208 2213 unsigned int mounts = 0; 2209 2214 struct mount *p; 2210 2215 2211 - if (ns->mounts >= max) 2216 + if (ns->nr_mounts >= max) 2212 2217 return -ENOSPC; 2213 - max -= ns->mounts; 2218 + max -= ns->nr_mounts; 2214 2219 if (ns->pending_mounts >= max) 2215 2220 return -ENOSPC; 2216 2221 max -= ns->pending_mounts; ··· 2354 2359 touch_mnt_namespace(source_mnt->mnt_ns); 2355 2360 } else { 2356 2361 if (source_mnt->mnt_ns) { 2362 + LIST_HEAD(head); 2363 + 2357 2364 /* move from anon - the caller will destroy */ 2358 - list_del_init(&source_mnt->mnt_ns->list); 2365 + for (p = source_mnt; p; p = next_mnt(p, source_mnt)) 2366 + move_from_ns(p, &head); 2367 + list_del_init(&head); 2359 2368 } 2360 2369 if (beneath) 2361 2370 mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp); ··· 2670 2671 2671 2672 lock_mount_hash(); 2672 2673 for (p = mnt; p; p = next_mnt(p, mnt)) { 2673 - p->mnt_ns = ns; 2674 - ns->mounts++; 2674 + mnt_add_to_ns(ns, p); 2675 + ns->nr_mounts++; 2675 2676 } 2676 2677 ns->root = mnt; 2677 - list_add_tail(&ns->list, &mnt->mnt_list); 2678 2678 mntget(&mnt->mnt); 2679 2679 unlock_mount_hash(); 2680 2680 namespace_unlock(); ··· 3736 3738 if (!anon) 3737 3739 new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); 3738 3740 refcount_set(&new_ns->ns.count, 1); 3739 - INIT_LIST_HEAD(&new_ns->list); 3741 + new_ns->mounts = RB_ROOT; 3740 3742 
init_waitqueue_head(&new_ns->poll); 3741 - spin_lock_init(&new_ns->ns_lock); 3742 3743 new_ns->user_ns = get_user_ns(user_ns); 3743 3744 new_ns->ucounts = ucounts; 3744 3745 return new_ns; ··· 3784 3787 unlock_mount_hash(); 3785 3788 } 3786 3789 new_ns->root = new; 3787 - list_add_tail(&new_ns->list, &new->mnt_list); 3788 3790 3789 3791 /* 3790 3792 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts ··· 3793 3797 p = old; 3794 3798 q = new; 3795 3799 while (p) { 3796 - q->mnt_ns = new_ns; 3797 - new_ns->mounts++; 3800 + mnt_add_to_ns(new_ns, q); 3801 + new_ns->nr_mounts++; 3798 3802 if (new_fs) { 3799 3803 if (&p->mnt == new_fs->root.mnt) { 3800 3804 new_fs->root.mnt = mntget(&q->mnt); ··· 3836 3840 mntput(m); 3837 3841 return ERR_CAST(ns); 3838 3842 } 3839 - mnt->mnt_ns = ns; 3840 3843 ns->root = mnt; 3841 - ns->mounts++; 3842 - list_add(&mnt->mnt_list, &ns->list); 3844 + ns->nr_mounts++; 3845 + mnt_add_to_ns(ns, mnt); 3843 3846 3844 3847 err = vfs_path_lookup(m->mnt_root, m, 3845 3848 name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); ··· 4016 4021 goto err_path; 4017 4022 } 4018 4023 mnt = real_mount(newmount.mnt); 4019 - mnt->mnt_ns = ns; 4020 4024 ns->root = mnt; 4021 - ns->mounts = 1; 4022 - list_add(&mnt->mnt_list, &ns->list); 4025 + ns->nr_mounts = 1; 4026 + mnt_add_to_ns(ns, mnt); 4023 4027 mntget(newmount.mnt); 4024 4028 4025 4029 /* Attach to an apparent O_PATH fd with a note that we need to unmount ··· 4689 4695 if (IS_ERR(ns)) 4690 4696 panic("Can't allocate initial namespace"); 4691 4697 m = real_mount(mnt); 4692 - m->mnt_ns = ns; 4693 4698 ns->root = m; 4694 - ns->mounts = 1; 4695 - list_add(&m->mnt_list, &ns->list); 4699 + ns->nr_mounts = 1; 4700 + mnt_add_to_ns(ns, m); 4696 4701 init_task.nsproxy->mnt_ns = ns; 4697 4702 get_mnt_ns(ns); 4698 4703 ··· 4818 4825 int *new_mnt_flags) 4819 4826 { 4820 4827 int new_flags = *new_mnt_flags; 4821 - struct mount *mnt; 4828 + struct mount *mnt, *n; 4822 4829 bool visible = false; 4823 4830 4824 
4831 down_read(&namespace_sem); 4825 - lock_ns_list(ns); 4826 - list_for_each_entry(mnt, &ns->list, mnt_list) { 4832 + rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) { 4827 4833 struct mount *child; 4828 4834 int mnt_flags; 4829 - 4830 - if (mnt_is_cursor(mnt)) 4831 - continue; 4832 4835 4833 4836 if (mnt->mnt.mnt_sb->s_type != sb->s_type) 4834 4837 continue; ··· 4873 4884 next: ; 4874 4885 } 4875 4886 found: 4876 - unlock_ns_list(ns); 4877 4887 up_read(&namespace_sem); 4878 4888 return visible; 4879 4889 }
+1 -1
fs/pnode.c
··· 468 468 mnt->mnt.mnt_flags |= MNT_UMOUNT; 469 469 list_del_init(&mnt->mnt_child); 470 470 list_del_init(&mnt->mnt_umounting); 471 - list_move_tail(&mnt->mnt_list, to_umount); 471 + move_from_ns(mnt, to_umount); 472 472 } 473 473 474 474 /*
-3
fs/proc_namespace.c
··· 283 283 p->ns = ns; 284 284 p->root = root; 285 285 p->show = show; 286 - INIT_LIST_HEAD(&p->cursor.mnt_list); 287 - p->cursor.mnt.mnt_flags = MNT_CURSOR; 288 286 289 287 return 0; 290 288 ··· 299 301 struct seq_file *m = file->private_data; 300 302 struct proc_mounts *p = m->private; 301 303 path_put(&p->root); 302 - mnt_cursor_del(p->ns, &p->cursor); 303 304 put_mnt_ns(p->ns); 304 305 return seq_release_private(inode, file); 305 306 }
+2 -3
include/linux/mount.h
··· 50 50 #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) 51 51 52 52 #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ 53 - MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | \ 54 - MNT_CURSOR) 53 + MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | MNT_ONRB) 55 54 56 55 #define MNT_INTERNAL 0x4000 57 56 ··· 64 65 #define MNT_SYNC_UMOUNT 0x2000000 65 66 #define MNT_MARKED 0x4000000 66 67 #define MNT_UMOUNT 0x8000000 67 - #define MNT_CURSOR 0x10000000 68 + #define MNT_ONRB 0x10000000 68 69 69 70 struct vfsmount { 70 71 struct dentry *mnt_root; /* root of the mounted tree */