Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

super: wait for nascent superblocks

Recent patches experiment with making it possible to allocate a new
superblock before opening the relevant block device. Naturally this has
intricate side-effects that we get to learn about while developing this.

Superblock allocators such as sget{_fc}() return with s_umount of the
new superblock held and lock ordering currently requires that block
level locks such as bdev_lock and open_mutex rank above s_umount.

Before aca740cecbe5 ("fs: open block device after superblock creation")
ordering was guaranteed to be correct as block devices were opened prior
to superblock allocation and thus s_umount wasn't held. But now s_umount
must be dropped before opening block devices to avoid locking
violations.

This has consequences. The main one being that iterators over
@super_blocks and @fs_supers that grab a temporary reference to the
superblock can now also grab s_umount before the caller has managed to
open block devices and called fill_super(). So whereas before such
iterators or concurrent mounts would have simply slept on s_umount until
SB_BORN was set or the superblock was discarded due to initialization
failure, they can now needlessly spin through sget{_fc}().

If the caller is sleeping on bdev_lock or open_mutex, one caller waiting
on SB_BORN will always spin somewhere and potentially this can go on for
quite a while.

It should be possible to drop s_umount while allowing iterators to wait
on a nascent superblock to either be born or discarded. This patch
implements a wait_var_event() mechanism allowing iterators to sleep
until they are woken when the superblock is born or discarded.

This also allows us to avoid relooping through @fs_supers and
@super_blocks if a superblock isn't yet born or dying.

Link: aca740cecbe5 ("fs: open block device after superblock creation")
Reviewed-by: Jan Kara <jack@suse.cz>
Message-Id: <20230818-vfs-super-fixes-v3-v3-3-9f0b1876e46b@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>

+154 -51
+153 -51
fs/super.c
··· 50 50 "sb_internal", 51 51 }; 52 52 53 - static inline void super_lock(struct super_block *sb, bool excl) 53 + static inline void __super_lock(struct super_block *sb, bool excl) 54 54 { 55 55 if (excl) 56 56 down_write(&sb->s_umount); ··· 66 66 up_read(&sb->s_umount); 67 67 } 68 68 69 - static inline void super_lock_excl(struct super_block *sb) 69 + static inline void __super_lock_excl(struct super_block *sb) 70 70 { 71 - super_lock(sb, true); 72 - } 73 - 74 - static inline void super_lock_shared(struct super_block *sb) 75 - { 76 - super_lock(sb, false); 71 + __super_lock(sb, true); 77 72 } 78 73 79 74 static inline void super_unlock_excl(struct super_block *sb) ··· 79 84 static inline void super_unlock_shared(struct super_block *sb) 80 85 { 81 86 super_unlock(sb, false); 87 + } 88 + 89 + static inline bool wait_born(struct super_block *sb) 90 + { 91 + unsigned int flags; 92 + 93 + /* 94 + * Pairs with smp_store_release() in super_wake() and ensures 95 + * that we see SB_BORN or SB_DYING after we're woken. 96 + */ 97 + flags = smp_load_acquire(&sb->s_flags); 98 + return flags & (SB_BORN | SB_DYING); 99 + } 100 + 101 + /** 102 + * super_lock - wait for superblock to become ready and lock it 103 + * @sb: superblock to wait for 104 + * @excl: whether exclusive access is required 105 + * 106 + * If the superblock has neither passed through vfs_get_tree() or 107 + * generic_shutdown_super() yet wait for it to happen. Either superblock 108 + * creation will succeed and SB_BORN is set by vfs_get_tree() or we're 109 + * woken and we'll see SB_DYING. 110 + * 111 + * The caller must have acquired a temporary reference on @sb->s_count. 112 + * 113 + * Return: This returns true if SB_BORN was set, false if SB_DYING was 114 + * set. The function acquires s_umount and returns with it held. 
115 + */ 116 + static __must_check bool super_lock(struct super_block *sb, bool excl) 117 + { 118 + 119 + lockdep_assert_not_held(&sb->s_umount); 120 + 121 + relock: 122 + __super_lock(sb, excl); 123 + 124 + /* 125 + * Has gone through generic_shutdown_super() in the meantime. 126 + * @sb->s_root is NULL and @sb->s_active is 0. No one needs to 127 + * grab a reference to this. Tell them so. 128 + */ 129 + if (sb->s_flags & SB_DYING) 130 + return false; 131 + 132 + /* Has called ->get_tree() successfully. */ 133 + if (sb->s_flags & SB_BORN) 134 + return true; 135 + 136 + super_unlock(sb, excl); 137 + 138 + /* wait until the superblock is ready or dying */ 139 + wait_var_event(&sb->s_flags, wait_born(sb)); 140 + 141 + /* 142 + * Neither SB_BORN nor SB_DYING are ever unset so we never loop. 143 + * Just reacquire @sb->s_umount for the caller. 144 + */ 145 + goto relock; 146 + } 147 + 148 + /* wait and acquire read-side of @sb->s_umount */ 149 + static inline bool super_lock_shared(struct super_block *sb) 150 + { 151 + return super_lock(sb, false); 152 + } 153 + 154 + /* wait and acquire write-side of @sb->s_umount */ 155 + static inline bool super_lock_excl(struct super_block *sb) 156 + { 157 + return super_lock(sb, true); 158 + } 159 + 160 + /* wake waiters */ 161 + #define SUPER_WAKE_FLAGS (SB_BORN | SB_DYING) 162 + static void super_wake(struct super_block *sb, unsigned int flag) 163 + { 164 + WARN_ON_ONCE((flag & ~SUPER_WAKE_FLAGS)); 165 + WARN_ON_ONCE(hweight32(flag & SUPER_WAKE_FLAGS) > 1); 166 + 167 + /* 168 + * Pairs with smp_load_acquire() in super_lock() to make sure 169 + * all initializations in the superblock are seen by the user 170 + * seeing SB_BORN sent. 171 + */ 172 + smp_store_release(&sb->s_flags, sb->s_flags | flag); 173 + /* 174 + * Pairs with the barrier in prepare_to_wait_event() to make sure 175 + * ___wait_var_event() either sees SB_BORN set or 176 + * waitqueue_active() check in wake_up_var() sees the waiter. 
177 + */ 178 + smp_mb(); 179 + wake_up_var(&sb->s_flags); 82 180 } 83 181 84 182 /* ··· 481 393 void deactivate_super(struct super_block *s) 482 394 { 483 395 if (!atomic_add_unless(&s->s_active, -1, 1)) { 484 - super_lock_excl(s); 396 + __super_lock_excl(s); 485 397 deactivate_locked_super(s); 486 398 } 487 399 } ··· 503 415 */ 504 416 static int grab_super(struct super_block *s) __releases(sb_lock) 505 417 { 418 + bool born; 419 + 506 420 s->s_count++; 507 421 spin_unlock(&sb_lock); 508 - super_lock_excl(s); 509 - if ((s->s_flags & SB_BORN) && atomic_inc_not_zero(&s->s_active)) { 422 + born = super_lock_excl(s); 423 + if (born && atomic_inc_not_zero(&s->s_active)) { 510 424 put_super(s); 511 425 return 1; 512 426 } ··· 537 447 bool super_trylock_shared(struct super_block *sb) 538 448 { 539 449 if (down_read_trylock(&sb->s_umount)) { 540 - if (!hlist_unhashed(&sb->s_instances) && 541 - sb->s_root && (sb->s_flags & SB_BORN)) 450 + if (!(sb->s_flags & SB_DYING) && sb->s_root && 451 + (sb->s_flags & SB_BORN)) 542 452 return true; 543 453 super_unlock_shared(sb); 544 454 } ··· 565 475 void retire_super(struct super_block *sb) 566 476 { 567 477 WARN_ON(!sb->s_bdev); 568 - super_lock_excl(sb); 478 + __super_lock_excl(sb); 569 479 if (sb->s_iflags & SB_I_PERSB_BDI) { 570 480 bdi_unregister(sb->s_bdi); 571 481 sb->s_iflags &= ~SB_I_PERSB_BDI; ··· 647 557 /* should be initialized for __put_super_and_need_restart() */ 648 558 hlist_del_init(&sb->s_instances); 649 559 spin_unlock(&sb_lock); 560 + /* 561 + * Broadcast to everyone that grabbed a temporary reference to this 562 + * superblock before we removed it from @fs_supers that the superblock 563 + * is dying. Every walker of @fs_supers outside of sget{_fc}() will now 564 + * discard this superblock and treat it as dead. 
565 + */ 566 + super_wake(sb, SB_DYING); 650 567 super_unlock_excl(sb); 651 568 if (sb->s_bdi != &noop_backing_dev_info) { 652 569 if (sb->s_iflags & SB_I_PERSB_BDI) ··· 728 631 s->s_type = fc->fs_type; 729 632 s->s_iflags |= fc->s_iflags; 730 633 strscpy(s->s_id, s->s_type->name, sizeof(s->s_id)); 634 + /* 635 + * Make the superblock visible on @super_blocks and @fs_supers. 636 + * It's in a nascent state and users should wait on SB_BORN or 637 + * SB_DYING to be set. 638 + */ 731 639 list_add_tail(&s->s_list, &super_blocks); 732 640 hlist_add_head(&s->s_instances, &s->s_type->fs_supers); 733 641 spin_unlock(&sb_lock); ··· 842 740 843 741 spin_lock(&sb_lock); 844 742 list_for_each_entry(sb, &super_blocks, s_list) { 845 - if (hlist_unhashed(&sb->s_instances)) 743 + /* Pairs with memory marrier in super_wake(). */ 744 + if (smp_load_acquire(&sb->s_flags) & SB_DYING) 846 745 continue; 847 746 sb->s_count++; 848 747 spin_unlock(&sb_lock); ··· 873 770 874 771 spin_lock(&sb_lock); 875 772 list_for_each_entry(sb, &super_blocks, s_list) { 876 - if (hlist_unhashed(&sb->s_instances)) 877 - continue; 773 + bool born; 774 + 878 775 sb->s_count++; 879 776 spin_unlock(&sb_lock); 880 777 881 - super_lock_shared(sb); 882 - if (sb->s_root && (sb->s_flags & SB_BORN)) 778 + born = super_lock_shared(sb); 779 + if (born && sb->s_root) 883 780 f(sb, arg); 884 781 super_unlock_shared(sb); 885 782 ··· 909 806 910 807 spin_lock(&sb_lock); 911 808 hlist_for_each_entry(sb, &type->fs_supers, s_instances) { 809 + bool born; 810 + 912 811 sb->s_count++; 913 812 spin_unlock(&sb_lock); 914 813 915 - super_lock_shared(sb); 916 - if (sb->s_root && (sb->s_flags & SB_BORN)) 814 + born = super_lock_shared(sb); 815 + if (born && sb->s_root) 917 816 f(sb, arg); 918 817 super_unlock_shared(sb); 919 818 ··· 946 841 if (!bdev) 947 842 return NULL; 948 843 949 - restart: 950 844 spin_lock(&sb_lock); 951 845 list_for_each_entry(sb, &super_blocks, s_list) { 952 - if (hlist_unhashed(&sb->s_instances)) 953 - 
continue; 954 846 if (sb->s_bdev == bdev) { 955 847 if (!grab_super(sb)) 956 - goto restart; 848 + return NULL; 957 849 super_unlock_excl(sb); 958 850 return sb; 959 851 } ··· 964 862 struct super_block *sb; 965 863 966 864 spin_lock(&sb_lock); 967 - rescan: 968 865 list_for_each_entry(sb, &super_blocks, s_list) { 969 - if (hlist_unhashed(&sb->s_instances)) 970 - continue; 971 866 if (sb->s_dev == dev) { 867 + bool born; 868 + 972 869 sb->s_count++; 973 870 spin_unlock(&sb_lock); 974 - super_lock(sb, excl); 975 871 /* still alive? */ 976 - if (sb->s_root && (sb->s_flags & SB_BORN)) 872 + born = super_lock(sb, excl); 873 + if (born && sb->s_root) 977 874 return sb; 978 875 super_unlock(sb, excl); 979 876 /* nope, got unmounted */ 980 877 spin_lock(&sb_lock); 981 878 __put_super(sb); 982 - goto rescan; 879 + break; 983 880 } 984 881 } 985 882 spin_unlock(&sb_lock); ··· 1022 921 if (!hlist_empty(&sb->s_pins)) { 1023 922 super_unlock_excl(sb); 1024 923 group_pin_kill(&sb->s_pins); 1025 - super_lock_excl(sb); 924 + __super_lock_excl(sb); 1026 925 if (!sb->s_root) 1027 926 return 0; 1028 927 if (sb->s_writers.frozen != SB_UNFROZEN) ··· 1085 984 1086 985 static void do_emergency_remount_callback(struct super_block *sb) 1087 986 { 1088 - super_lock_excl(sb); 1089 - if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) && 1090 - !sb_rdonly(sb)) { 987 + bool born = super_lock_excl(sb); 988 + 989 + if (born && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) { 1091 990 struct fs_context *fc; 1092 991 1093 992 fc = fs_context_for_reconfigure(sb->s_root, ··· 1121 1020 1122 1021 static void do_thaw_all_callback(struct super_block *sb) 1123 1022 { 1124 - super_lock_excl(sb); 1125 - if (sb->s_root && sb->s_flags & SB_BORN) { 1023 + bool born = super_lock_excl(sb); 1024 + 1025 + if (born && sb->s_root) { 1126 1026 emergency_thaw_bdev(sb); 1127 1027 thaw_super_locked(sb); 1128 1028 } else { ··· 1314 1212 */ 1315 1213 static bool super_lock_shared_active(struct super_block *sb) 1316 
1214 { 1317 - super_lock_shared(sb); 1318 - if (!sb->s_root || 1319 - (sb->s_flags & (SB_ACTIVE | SB_BORN)) != (SB_ACTIVE | SB_BORN)) { 1215 + bool born = super_lock_shared(sb); 1216 + 1217 + if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) { 1320 1218 super_unlock_shared(sb); 1321 1219 return false; 1322 1220 } ··· 1476 1374 */ 1477 1375 super_unlock_excl(s); 1478 1376 error = setup_bdev_super(s, fc->sb_flags, fc); 1479 - super_lock_excl(s); 1377 + __super_lock_excl(s); 1480 1378 if (!error) 1481 1379 error = fill_super(s, fc); 1482 1380 if (error) { ··· 1528 1426 */ 1529 1427 super_unlock_excl(s); 1530 1428 error = setup_bdev_super(s, flags, NULL); 1531 - super_lock_excl(s); 1429 + __super_lock_excl(s); 1532 1430 if (!error) 1533 1431 error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); 1534 1432 if (error) { ··· 1668 1566 WARN_ON(!sb->s_bdi); 1669 1567 1670 1568 /* 1671 - * Write barrier is for super_cache_count(). We place it before setting 1672 - * SB_BORN as the data dependency between the two functions is the 1673 - * superblock structure contents that we just set up, not the SB_BORN 1674 - * flag. 1569 + * super_wake() contains a memory barrier which also care of 1570 + * ordering for super_cache_count(). We place it before setting 1571 + * SB_BORN as the data dependency between the two functions is 1572 + * the superblock structure contents that we just set up, not 1573 + * the SB_BORN flag. 
1675 1574 */ 1676 - smp_wmb(); 1677 - sb->s_flags |= SB_BORN; 1575 + super_wake(sb, SB_BORN); 1678 1576 1679 1577 error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL); 1680 1578 if (unlikely(error)) { ··· 1817 1715 int ret; 1818 1716 1819 1717 atomic_inc(&sb->s_active); 1820 - super_lock_excl(sb); 1718 + __super_lock_excl(sb); 1821 1719 if (sb->s_writers.frozen != SB_UNFROZEN) { 1822 1720 deactivate_locked_super(sb); 1823 1721 return -EBUSY; ··· 1839 1737 /* Release s_umount to preserve sb_start_write -> s_umount ordering */ 1840 1738 super_unlock_excl(sb); 1841 1739 sb_wait_write(sb, SB_FREEZE_WRITE); 1842 - super_lock_excl(sb); 1740 + __super_lock_excl(sb); 1843 1741 1844 1742 /* Now we go and block page faults... */ 1845 1743 sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; ··· 1922 1820 */ 1923 1821 int thaw_super(struct super_block *sb) 1924 1822 { 1925 - super_lock_excl(sb); 1823 + __super_lock_excl(sb); 1926 1824 return thaw_super_locked(sb); 1927 1825 } 1928 1826 EXPORT_SYMBOL(thaw_super);
+1
include/linux/fs.h
··· 1095 1095 #define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */ 1096 1096 1097 1097 /* These sb flags are internal to the kernel */ 1098 + #define SB_DYING BIT(24) 1098 1099 #define SB_SUBMOUNT BIT(26) 1099 1100 #define SB_FORCE BIT(27) 1100 1101 #define SB_NOSEC BIT(28)