Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

RCU'd vfsmounts

* RCU-delayed freeing of vfsmounts
* vfsmount_lock replaced with a seqlock (mount_lock)
* sequence number from mount_lock is stored in nameidata->m_seq and
used when we exit RCU mode
* new vfsmount flag - MNT_SYNC_UMOUNT. Set by umount_tree() when its
caller knows that the vfsmount will have no surviving references.
* synchronize_rcu() done between unlocking namespace_sem in namespace_unlock()
and doing pending mntput().
* new helper: legitimize_mnt(mnt, seq). Checks the mount_lock sequence
number against seq, then grabs a reference to mnt. It then rechecks
mount_lock to close the race and either returns success or drops the
reference it has acquired. The subtle point is that in the case of
MNT_SYNC_UMOUNT we can
simply decrement the refcount and sod off - aforementioned synchronize_rcu()
makes sure that final mntput() won't come until we leave RCU mode. We need
that, since we don't want to end up with some lazy pathwalk racing with
umount() and stealing the final mntput() from it - caller of umount() may
expect it to return only once the fs is shut down and we don't want to break
that. In other cases (i.e. with MNT_SYNC_UMOUNT absent) we have to do a
full-blown mntput() if a mount_lock sequence number mismatch happens
just as we've grabbed the reference, but in those cases we won't be
stealing the final mntput() from anything that would care.
* mntput_no_expire() doesn't lock anything on the fast path now. Incidentally,
SMP and UP cases are handled the same way - no ifdefs there.
* normal pathname resolution does *not* do any writes to mount_lock. It does,
of course, bump the refcounts of vfsmount and dentry in the very end, but that's
it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

Al Viro 48a066e7 42c32608

+136 -83
+14 -6
fs/dcache.c
··· 2887 2887 struct vfsmount *vfsmnt = path->mnt; 2888 2888 struct mount *mnt = real_mount(vfsmnt); 2889 2889 int error = 0; 2890 - unsigned seq = 0; 2890 + unsigned seq, m_seq = 0; 2891 2891 char *bptr; 2892 2892 int blen; 2893 2893 2894 - br_read_lock(&vfsmount_lock); 2895 2894 rcu_read_lock(); 2895 + restart_mnt: 2896 + read_seqbegin_or_lock(&mount_lock, &m_seq); 2897 + seq = 0; 2896 2898 restart: 2897 2899 bptr = *buffer; 2898 2900 blen = *buflen; 2901 + error = 0; 2899 2902 read_seqbegin_or_lock(&rename_lock, &seq); 2900 2903 while (dentry != root->dentry || vfsmnt != root->mnt) { 2901 2904 struct dentry * parent; 2902 2905 2903 2906 if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { 2907 + struct mount *parent = ACCESS_ONCE(mnt->mnt_parent); 2904 2908 /* Global root? */ 2905 - if (mnt_has_parent(mnt)) { 2906 - dentry = mnt->mnt_mountpoint; 2907 - mnt = mnt->mnt_parent; 2909 + if (mnt != parent) { 2910 + dentry = ACCESS_ONCE(mnt->mnt_mountpoint); 2911 + mnt = parent; 2908 2912 vfsmnt = &mnt->mnt; 2909 2913 continue; 2910 2914 } ··· 2942 2938 goto restart; 2943 2939 } 2944 2940 done_seqretry(&rename_lock, seq); 2945 - br_read_unlock(&vfsmount_lock); 2941 + if (need_seqretry(&mount_lock, m_seq)) { 2942 + m_seq = 1; 2943 + goto restart_mnt; 2944 + } 2945 + done_seqretry(&mount_lock, m_seq); 2946 2946 2947 2947 if (error >= 0 && bptr == *buffer) { 2948 2948 if (--blen < 0)
+6 -4
fs/mount.h
··· 1 1 #include <linux/mount.h> 2 2 #include <linux/seq_file.h> 3 3 #include <linux/poll.h> 4 - #include <linux/lglock.h> 5 4 6 5 struct mnt_namespace { 7 6 atomic_t count; ··· 29 30 struct mount *mnt_parent; 30 31 struct dentry *mnt_mountpoint; 31 32 struct vfsmount mnt; 33 + struct rcu_head mnt_rcu; 32 34 #ifdef CONFIG_SMP 33 35 struct mnt_pcp __percpu *mnt_pcp; 34 36 #else ··· 80 80 extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); 81 81 extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *); 82 82 83 + extern bool legitimize_mnt(struct vfsmount *, unsigned); 84 + 83 85 static inline void get_mnt_ns(struct mnt_namespace *ns) 84 86 { 85 87 atomic_inc(&ns->count); 86 88 } 87 89 88 - extern struct lglock vfsmount_lock; 90 + extern seqlock_t mount_lock; 89 91 90 92 static inline void lock_mount_hash(void) 91 93 { 92 - br_write_lock(&vfsmount_lock); 94 + write_seqlock(&mount_lock); 93 95 } 94 96 95 97 static inline void unlock_mount_hash(void) 96 98 { 97 - br_write_unlock(&vfsmount_lock); 99 + write_sequnlock(&mount_lock); 98 100 } 99 101 100 102 struct proc_mounts {
+26 -24
fs/namei.c
··· 484 484 485 485 static inline void lock_rcu_walk(void) 486 486 { 487 - br_read_lock(&vfsmount_lock); 488 487 rcu_read_lock(); 489 488 } 490 489 491 490 static inline void unlock_rcu_walk(void) 492 491 { 493 492 rcu_read_unlock(); 494 - br_read_unlock(&vfsmount_lock); 495 493 } 496 494 497 495 /** ··· 510 512 BUG_ON(!(nd->flags & LOOKUP_RCU)); 511 513 512 514 /* 513 - * Get a reference to the parent first: we're 514 - * going to make "path_put(nd->path)" valid in 515 - * non-RCU context for "terminate_walk()". 516 - * 517 - * If this doesn't work, return immediately with 518 - * RCU walking still active (and then we will do 519 - * the RCU walk cleanup in terminate_walk()). 515 + * After legitimizing the bastards, terminate_walk() 516 + * will do the right thing for non-RCU mode, and all our 517 + * subsequent exit cases should rcu_read_unlock() 518 + * before returning. Do vfsmount first; if dentry 519 + * can't be legitimized, just set nd->path.dentry to NULL 520 + * and rely on dput(NULL) being a no-op. 520 521 */ 521 - if (!lockref_get_not_dead(&parent->d_lockref)) 522 + if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) 522 523 return -ECHILD; 523 - 524 - /* 525 - * After the mntget(), we terminate_walk() will do 526 - * the right thing for non-RCU mode, and all our 527 - * subsequent exit cases should unlock_rcu_walk() 528 - * before returning. 
529 - */ 530 - mntget(nd->path.mnt); 531 524 nd->flags &= ~LOOKUP_RCU; 525 + 526 + if (!lockref_get_not_dead(&parent->d_lockref)) { 527 + nd->path.dentry = NULL; 528 + unlock_rcu_walk(); 529 + return -ECHILD; 530 + } 532 531 533 532 /* 534 533 * For a negative lookup, the lookup sequence point is the parents ··· 603 608 if (!(nd->flags & LOOKUP_ROOT)) 604 609 nd->root.mnt = NULL; 605 610 611 + if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) { 612 + unlock_rcu_walk(); 613 + return -ECHILD; 614 + } 606 615 if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) { 607 616 unlock_rcu_walk(); 617 + mntput(nd->path.mnt); 608 618 return -ECHILD; 609 619 } 610 620 if (read_seqcount_retry(&dentry->d_seq, nd->seq)) { 611 621 unlock_rcu_walk(); 612 622 dput(dentry); 623 + mntput(nd->path.mnt); 613 624 return -ECHILD; 614 625 } 615 - mntget(nd->path.mnt); 616 626 unlock_rcu_walk(); 617 627 } 618 628 ··· 909 909 struct mount *parent; 910 910 struct dentry *mountpoint; 911 911 912 - br_read_lock(&vfsmount_lock); 912 + read_seqlock_excl(&mount_lock); 913 913 parent = mnt->mnt_parent; 914 914 if (parent == mnt) { 915 - br_read_unlock(&vfsmount_lock); 915 + read_sequnlock_excl(&mount_lock); 916 916 return 0; 917 917 } 918 918 mntget(&parent->mnt); 919 919 mountpoint = dget(mnt->mnt_mountpoint); 920 - br_read_unlock(&vfsmount_lock); 920 + read_sequnlock_excl(&mount_lock); 921 921 dput(path->dentry); 922 922 path->dentry = mountpoint; 923 923 mntput(path->mnt); ··· 1048 1048 1049 1049 /* Something is mounted on this dentry in another 1050 1050 * namespace and/or whatever was mounted there in this 1051 - * namespace got unmounted before we managed to get the 1052 - * vfsmount_lock */ 1051 + * namespace got unmounted before lookup_mnt() could 1052 + * get it */ 1053 1053 } 1054 1054 1055 1055 /* Handle an automount point */ ··· 1864 1864 if (flags & LOOKUP_RCU) { 1865 1865 lock_rcu_walk(); 1866 1866 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); 1867 + nd->m_seq = 
read_seqbegin(&mount_lock); 1867 1868 } else { 1868 1869 path_get(&nd->path); 1869 1870 } ··· 1873 1872 1874 1873 nd->root.mnt = NULL; 1875 1874 1875 + nd->m_seq = read_seqbegin(&mount_lock); 1876 1876 if (*name=='/') { 1877 1877 if (flags & LOOKUP_RCU) { 1878 1878 lock_rcu_walk();
+87 -48
fs/namespace.c
··· 53 53 * It should be taken for write in all cases where the vfsmount 54 54 * tree or hash is modified or when a vfsmount structure is modified. 55 55 */ 56 - DEFINE_BRLOCK(vfsmount_lock); 56 + __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); 57 57 58 58 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) 59 59 { ··· 547 547 kmem_cache_free(mnt_cache, mnt); 548 548 } 549 549 550 + /* call under rcu_read_lock */ 551 + bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) 552 + { 553 + struct mount *mnt; 554 + if (read_seqretry(&mount_lock, seq)) 555 + return false; 556 + if (bastard == NULL) 557 + return true; 558 + mnt = real_mount(bastard); 559 + mnt_add_count(mnt, 1); 560 + if (likely(!read_seqretry(&mount_lock, seq))) 561 + return true; 562 + if (bastard->mnt_flags & MNT_SYNC_UMOUNT) { 563 + mnt_add_count(mnt, -1); 564 + return false; 565 + } 566 + rcu_read_unlock(); 567 + mntput(bastard); 568 + rcu_read_lock(); 569 + return false; 570 + } 571 + 550 572 /* 551 573 * find the first mount at @dentry on vfsmount @mnt. 552 - * vfsmount_lock must be held for read or write. 574 + * call under rcu_read_lock() 553 575 */ 554 576 struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) 555 577 { 556 578 struct list_head *head = mount_hashtable + hash(mnt, dentry); 557 579 struct mount *p; 558 580 559 - list_for_each_entry(p, head, mnt_hash) 581 + list_for_each_entry_rcu(p, head, mnt_hash) 560 582 if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) 561 583 return p; 562 584 return NULL; ··· 586 564 587 565 /* 588 566 * find the last mount at @dentry on vfsmount @mnt. 589 - * vfsmount_lock must be held for read or write. 567 + * mount_lock must be held. 
590 568 */ 591 569 struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) 592 570 { ··· 618 596 struct vfsmount *lookup_mnt(struct path *path) 619 597 { 620 598 struct mount *child_mnt; 599 + struct vfsmount *m; 600 + unsigned seq; 621 601 622 - br_read_lock(&vfsmount_lock); 623 - child_mnt = __lookup_mnt(path->mnt, path->dentry); 624 - if (child_mnt) { 625 - mnt_add_count(child_mnt, 1); 626 - br_read_unlock(&vfsmount_lock); 627 - return &child_mnt->mnt; 628 - } else { 629 - br_read_unlock(&vfsmount_lock); 630 - return NULL; 631 - } 602 + rcu_read_lock(); 603 + do { 604 + seq = read_seqbegin(&mount_lock); 605 + child_mnt = __lookup_mnt(path->mnt, path->dentry); 606 + m = child_mnt ? &child_mnt->mnt : NULL; 607 + } while (!legitimize_mnt(m, seq)); 608 + rcu_read_unlock(); 609 + return m; 632 610 } 633 611 634 612 static struct mountpoint *new_mountpoint(struct dentry *dentry) ··· 896 874 return ERR_PTR(err); 897 875 } 898 876 877 + static void delayed_free(struct rcu_head *head) 878 + { 879 + struct mount *mnt = container_of(head, struct mount, mnt_rcu); 880 + kfree(mnt->mnt_devname); 881 + #ifdef CONFIG_SMP 882 + free_percpu(mnt->mnt_pcp); 883 + #endif 884 + kmem_cache_free(mnt_cache, mnt); 885 + } 886 + 899 887 static void mntput_no_expire(struct mount *mnt) 900 888 { 901 889 put_again: 902 - #ifdef CONFIG_SMP 903 - br_read_lock(&vfsmount_lock); 904 - if (likely(mnt->mnt_ns)) { 905 - /* shouldn't be the last one */ 906 - mnt_add_count(mnt, -1); 907 - br_read_unlock(&vfsmount_lock); 890 + rcu_read_lock(); 891 + mnt_add_count(mnt, -1); 892 + if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ 893 + rcu_read_unlock(); 908 894 return; 909 895 } 910 - br_read_unlock(&vfsmount_lock); 911 - 912 896 lock_mount_hash(); 913 - mnt_add_count(mnt, -1); 914 897 if (mnt_get_count(mnt)) { 898 + rcu_read_unlock(); 915 899 unlock_mount_hash(); 916 900 return; 917 901 } 918 - #else 919 - mnt_add_count(mnt, -1); 920 - if (likely(mnt_get_count(mnt))) 921 - 
return; 922 - lock_mount_hash(); 923 - #endif 924 902 if (unlikely(mnt->mnt_pinned)) { 925 903 mnt_add_count(mnt, mnt->mnt_pinned + 1); 926 904 mnt->mnt_pinned = 0; 905 + rcu_read_unlock(); 927 906 unlock_mount_hash(); 928 907 acct_auto_close_mnt(&mnt->mnt); 929 908 goto put_again; 930 909 } 910 + if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { 911 + rcu_read_unlock(); 912 + unlock_mount_hash(); 913 + return; 914 + } 915 + mnt->mnt.mnt_flags |= MNT_DOOMED; 916 + rcu_read_unlock(); 931 917 932 918 list_del(&mnt->mnt_instance); 933 919 unlock_mount_hash(); ··· 954 924 fsnotify_vfsmount_delete(&mnt->mnt); 955 925 dput(mnt->mnt.mnt_root); 956 926 deactivate_super(mnt->mnt.mnt_sb); 957 - free_vfsmnt(mnt); 927 + mnt_free_id(mnt); 928 + call_rcu(&mnt->mnt_rcu, delayed_free); 958 929 } 959 930 960 931 void mntput(struct vfsmount *mnt) ··· 1168 1137 list_splice_init(&unmounted, &head); 1169 1138 up_write(&namespace_sem); 1170 1139 1140 + synchronize_rcu(); 1141 + 1171 1142 while (!list_empty(&head)) { 1172 1143 mnt = list_first_entry(&head, struct mount, mnt_hash); 1173 1144 list_del_init(&mnt->mnt_hash); ··· 1185 1152 } 1186 1153 1187 1154 /* 1188 - * vfsmount lock must be held for write 1155 + * mount_lock must be held 1189 1156 * namespace_sem must be held for write 1157 + * how = 0 => just this tree, don't propagate 1158 + * how = 1 => propagate; we know that nobody else has reference to any victims 1159 + * how = 2 => lazy umount 1190 1160 */ 1191 - void umount_tree(struct mount *mnt, int propagate) 1161 + void umount_tree(struct mount *mnt, int how) 1192 1162 { 1193 1163 LIST_HEAD(tmp_list); 1194 1164 struct mount *p; ··· 1199 1163 for (p = mnt; p; p = next_mnt(p, mnt)) 1200 1164 list_move(&p->mnt_hash, &tmp_list); 1201 1165 1202 - if (propagate) 1166 + if (how) 1203 1167 propagate_umount(&tmp_list); 1204 1168 1205 1169 list_for_each_entry(p, &tmp_list, mnt_hash) { ··· 1207 1171 list_del_init(&p->mnt_list); 1208 1172 __touch_mnt_namespace(p->mnt_ns); 1209 1173 
p->mnt_ns = NULL; 1174 + if (how < 2) 1175 + p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; 1210 1176 list_del_init(&p->mnt_child); 1211 1177 if (mnt_has_parent(p)) { 1212 1178 put_mountpoint(p->mnt_mp); ··· 1300 1262 lock_mount_hash(); 1301 1263 event++; 1302 1264 1303 - if (!(flags & MNT_DETACH)) 1304 - shrink_submounts(mnt); 1305 - 1306 - retval = -EBUSY; 1307 - if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) { 1265 + if (flags & MNT_DETACH) { 1308 1266 if (!list_empty(&mnt->mnt_list)) 1309 - umount_tree(mnt, 1); 1267 + umount_tree(mnt, 2); 1310 1268 retval = 0; 1269 + } else { 1270 + shrink_submounts(mnt); 1271 + retval = -EBUSY; 1272 + if (!propagate_mount_busy(mnt, 2)) { 1273 + if (!list_empty(&mnt->mnt_list)) 1274 + umount_tree(mnt, 1); 1275 + retval = 0; 1276 + } 1311 1277 } 1312 1278 unlock_mount_hash(); 1313 1279 namespace_unlock(); ··· 1997 1955 struct mount *parent; 1998 1956 int err; 1999 1957 2000 - mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL); 1958 + mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT); 2001 1959 2002 1960 mp = lock_mount(path); 2003 1961 if (IS_ERR(mp)) ··· 2214 2172 * process a list of expirable mountpoints with the intent of discarding any 2215 2173 * submounts of a specific parent mountpoint 2216 2174 * 2217 - * vfsmount_lock must be held for write 2175 + * mount_lock must be held for write 2218 2176 */ 2219 2177 static void shrink_submounts(struct mount *mnt) 2220 2178 { ··· 2600 2558 /* 2601 2559 * Return true if path is reachable from root 2602 2560 * 2603 - * namespace_sem or vfsmount_lock is held 2561 + * namespace_sem or mount_lock is held 2604 2562 */ 2605 2563 bool is_path_reachable(struct mount *mnt, struct dentry *dentry, 2606 2564 const struct path *root) ··· 2615 2573 int path_is_under(struct path *path1, struct path *path2) 2616 2574 { 2617 2575 int res; 2618 - br_read_lock(&vfsmount_lock); 2576 + read_seqlock_excl(&mount_lock); 2619 2577 res = 
is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); 2620 - br_read_unlock(&vfsmount_lock); 2578 + read_sequnlock_excl(&mount_lock); 2621 2579 return res; 2622 2580 } 2623 2581 EXPORT_SYMBOL(path_is_under); ··· 2790 2748 for (u = 0; u < HASH_SIZE; u++) 2791 2749 INIT_LIST_HEAD(&mountpoint_hashtable[u]); 2792 2750 2793 - br_lock_init(&vfsmount_lock); 2794 - 2795 2751 err = sysfs_init(); 2796 2752 if (err) 2797 2753 printk(KERN_WARNING "%s: sysfs_init error: %d\n", ··· 2828 2788 { 2829 2789 /* release long term mount so mount point can be released */ 2830 2790 if (!IS_ERR_OR_NULL(mnt)) { 2831 - lock_mount_hash(); 2832 2791 real_mount(mnt)->mnt_ns = NULL; 2833 - unlock_mount_hash(); 2792 + synchronize_rcu(); /* yecchhh... */ 2834 2793 mntput(mnt); 2835 2794 } 2836 2795 }
+2
include/linux/mount.h
··· 49 49 50 50 #define MNT_LOCK_READONLY 0x400000 51 51 #define MNT_LOCKED 0x800000 52 + #define MNT_DOOMED 0x1000000 53 + #define MNT_SYNC_UMOUNT 0x2000000 52 54 53 55 struct vfsmount { 54 56 struct dentry *mnt_root; /* root of the mounted tree */
+1 -1
include/linux/namei.h
··· 16 16 struct path root; 17 17 struct inode *inode; /* path.dentry.d_inode */ 18 18 unsigned int flags; 19 - unsigned seq; 19 + unsigned seq, m_seq; 20 20 int last_type; 21 21 unsigned depth; 22 22 char *saved_names[MAX_NESTED_LINKS + 1];