Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

kernfs: Use RCU to access kernfs_node::name.

Using RCU lifetime rules to access kernfs_node::name can avoid the
trouble with kernfs_rename_lock in kernfs_name() and kernfs_path_from_node()
if the fs was created with KERNFS_ROOT_INVARIANT_PARENT. This is usefull
as it allows to implement kernfs_path_from_node() only with RCU
protection and avoiding kernfs_rename_lock. The lock is only required if
the __parent node can be changed and the function requires an unchanged
hierarchy while it iterates from the node to its parent.
The change is needed to allow the lookup of the node's path
(kernfs_path_from_node()) from context which runs always with disabled
preemption and or interrutps even on PREEMPT_RT. The problem is that
kernfs_rename_lock becomes a sleeping lock on PREEMPT_RT.

I went through all ::name users and added the required access for the lookup
with a few extensions:
- rdtgroup_pseudo_lock_create() drops all locks and then uses the name
later on. resctrl supports rename with different parents. Here I made
a temporal copy of the name while it is used outside of the lock.

- kernfs_rename_ns() accepts NULL as new_parent. This simplifies
sysfs_move_dir_ns() where it can set NULL in order to reuse the current
name.

- kernfs_rename_ns() is only using kernfs_rename_lock if the parents are
different. All users use either kernfs_rwsem (for stable path view) or
just RCU for the lookup. The ::name uses always RCU free.

Use RCU lifetime guarantees to access kernfs_node::name.

Suggested-by: Tejun Heo <tj@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Reported-by: syzbot+6ea37e2e6ffccf41a7e6@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/lkml/67251dc6.050a0220.529b6.015e.GAE@google.com/
Reported-by: Hillf Danton <hdanton@sina.com>
Closes: https://lore.kernel.org/20241102001224.2789-1-hdanton@sina.com
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lore.kernel.org/r/20250213145023.2820193-7-bigeasy@linutronix.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

authored by

Sebastian Andrzej Siewior and committed by
Greg Kroah-Hartman
741c10b0 63348894

+105 -71
+5
arch/x86/kernel/cpu/resctrl/internal.h
··· 507 507 508 508 extern struct mutex rdtgroup_mutex; 509 509 510 + static inline const char *rdt_kn_name(const struct kernfs_node *kn) 511 + { 512 + return rcu_dereference_check(kn->name, lockdep_is_held(&rdtgroup_mutex)); 513 + } 514 + 510 515 extern struct rdt_hw_resource rdt_resources_all[]; 511 516 extern struct rdtgroup rdtgroup_default; 512 517 extern struct dentry *debugfs_resctrl;
+10 -4
arch/x86/kernel/cpu/resctrl/pseudo_lock.c
··· 52 52 rdtgrp = dev_get_drvdata(dev); 53 53 if (mode) 54 54 *mode = 0600; 55 - return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name); 55 + guard(mutex)(&rdtgroup_mutex); 56 + return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdt_kn_name(rdtgrp->kn)); 56 57 } 57 58 58 59 static const struct class pseudo_lock_class = { ··· 1294 1293 struct task_struct *thread; 1295 1294 unsigned int new_minor; 1296 1295 struct device *dev; 1296 + char *kn_name __free(kfree) = NULL; 1297 1297 int ret; 1298 1298 1299 1299 ret = pseudo_lock_region_alloc(plr); ··· 1305 1303 if (ret < 0) { 1306 1304 ret = -EINVAL; 1307 1305 goto out_region; 1306 + } 1307 + kn_name = kstrdup(rdt_kn_name(rdtgrp->kn), GFP_KERNEL); 1308 + if (!kn_name) { 1309 + ret = -ENOMEM; 1310 + goto out_cstates; 1308 1311 } 1309 1312 1310 1313 plr->thread_done = 0; ··· 1355 1348 mutex_unlock(&rdtgroup_mutex); 1356 1349 1357 1350 if (!IS_ERR_OR_NULL(debugfs_resctrl)) { 1358 - plr->debugfs_dir = debugfs_create_dir(rdtgrp->kn->name, 1359 - debugfs_resctrl); 1351 + plr->debugfs_dir = debugfs_create_dir(kn_name, debugfs_resctrl); 1360 1352 if (!IS_ERR_OR_NULL(plr->debugfs_dir)) 1361 1353 debugfs_create_file("pseudo_lock_measure", 0200, 1362 1354 plr->debugfs_dir, rdtgrp, ··· 1364 1358 1365 1359 dev = device_create(&pseudo_lock_class, NULL, 1366 1360 MKDEV(pseudo_lock_major, new_minor), 1367 - rdtgrp, "%s", rdtgrp->kn->name); 1361 + rdtgrp, "%s", kn_name); 1368 1362 1369 1363 mutex_lock(&rdtgroup_mutex); 1370 1364
+5 -5
arch/x86/kernel/cpu/resctrl/rdtgroup.c
··· 916 916 continue; 917 917 918 918 seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "", 919 - rdtg->kn->name); 919 + rdt_kn_name(rdtg->kn)); 920 920 seq_puts(s, "mon:"); 921 921 list_for_each_entry(crg, &rdtg->mon.crdtgrp_list, 922 922 mon.crdtgrp_list) { 923 923 if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid, 924 924 crg->mon.rmid)) 925 925 continue; 926 - seq_printf(s, "%s", crg->kn->name); 926 + seq_printf(s, "%s", rdt_kn_name(crg->kn)); 927 927 break; 928 928 } 929 929 seq_putc(s, '\n'); ··· 3675 3675 */ 3676 3676 static bool is_mon_groups(struct kernfs_node *kn, const char *name) 3677 3677 { 3678 - return (!strcmp(kn->name, "mon_groups") && 3678 + return (!strcmp(rdt_kn_name(kn), "mon_groups") && 3679 3679 strcmp(name, "mon_groups")); 3680 3680 } 3681 3681 ··· 3824 3824 ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask); 3825 3825 } 3826 3826 } else if (rdtgrp->type == RDTMON_GROUP && 3827 - is_mon_groups(parent_kn, kn->name)) { 3827 + is_mon_groups(parent_kn, rdt_kn_name(kn))) { 3828 3828 ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask); 3829 3829 } else { 3830 3830 ret = -EPERM; ··· 3912 3912 3913 3913 kn_parent = rdt_kn_parent(kn); 3914 3914 if (rdtgrp->type != RDTMON_GROUP || !kn_parent || 3915 - !is_mon_groups(kn_parent, kn->name)) { 3915 + !is_mon_groups(kn_parent, rdt_kn_name(kn))) { 3916 3916 rdt_last_cmd_puts("Source must be a MON group\n"); 3917 3917 ret = -EPERM; 3918 3918 goto out;
+62 -51
fs/kernfs/dir.c
··· 51 51 #endif 52 52 } 53 53 54 - static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen) 55 - { 56 - if (!kn) 57 - return strscpy(buf, "(null)", buflen); 58 - 59 - return strscpy(buf, rcu_access_pointer(kn->__parent) ? kn->name : "/", buflen); 60 - } 61 - 62 54 /* kernfs_node_depth - compute depth from @from to @to */ 63 55 static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to) 64 56 { ··· 160 168 161 169 /* Calculate how many bytes we need for the rest */ 162 170 for (i = depth_to - 1; i >= 0; i--) { 171 + const char *name; 163 172 164 173 for (kn = kn_to, j = 0; j < i; j++) 165 174 kn = rcu_dereference(kn->__parent); 166 175 167 - len += scnprintf(buf + len, buflen - len, "/%s", kn->name); 176 + name = rcu_dereference(kn->name); 177 + len += scnprintf(buf + len, buflen - len, "/%s", name); 168 178 } 169 179 170 180 return len; ··· 190 196 */ 191 197 int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) 192 198 { 193 - unsigned long flags; 194 - int ret; 199 + struct kernfs_node *kn_parent; 195 200 196 - read_lock_irqsave(&kernfs_rename_lock, flags); 197 - ret = kernfs_name_locked(kn, buf, buflen); 198 - read_unlock_irqrestore(&kernfs_rename_lock, flags); 199 - return ret; 201 + if (!kn) 202 + return strscpy(buf, "(null)", buflen); 203 + 204 + guard(rcu)(); 205 + /* 206 + * KERNFS_ROOT_INVARIANT_PARENT is ignored here. The name is RCU freed and 207 + * the parent is either existing or not. 208 + */ 209 + kn_parent = rcu_dereference(kn->__parent); 210 + return strscpy(buf, kn_parent ? rcu_dereference(kn->name) : "/", buflen); 200 211 } 201 212 202 213 /** ··· 223 224 int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, 224 225 char *buf, size_t buflen) 225 226 { 226 - unsigned long flags; 227 - int ret; 227 + struct kernfs_root *root; 228 228 229 229 guard(rcu)(); 230 - read_lock_irqsave(&kernfs_rename_lock, flags); 231 - ret = kernfs_path_from_node_locked(to, from, buf, buflen); 232 - read_unlock_irqrestore(&kernfs_rename_lock, flags); 233 - return ret; 230 + if (to) { 231 + root = kernfs_root(to); 232 + if (!(root->flags & KERNFS_ROOT_INVARIANT_PARENT)) { 233 + guard(read_lock_irqsave)(&kernfs_rename_lock); 234 + return kernfs_path_from_node_locked(to, from, buf, buflen); 235 + } 236 + } 237 + return kernfs_path_from_node_locked(to, from, buf, buflen); 234 238 } 235 239 EXPORT_SYMBOL_GPL(kernfs_path_from_node); 236 240 ··· 340 338 return -1; 341 339 if (ns > kn->ns) 342 340 return 1; 343 - return strcmp(name, kn->name); 341 + return strcmp(name, kernfs_rcu_name(kn)); 344 342 } 345 343 346 344 static int kernfs_sd_compare(const struct kernfs_node *left, 347 345 const struct kernfs_node *right) 348 346 { 349 - return kernfs_name_compare(left->hash, left->name, left->ns, right); 347 + return kernfs_name_compare(left->hash, kernfs_rcu_name(left), left->ns, right); 350 348 } 351 349 352 350 /** ··· 544 542 { 545 543 struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu); 546 544 547 - kfree_const(kn->name); 545 + /* If the whole node goes away, then name can't be used outside */ 546 + kfree_const(rcu_access_pointer(kn->name)); 548 547 549 548 if (kn->iattr) { 550 549 simple_xattrs_free(&kn->iattr->xattrs, NULL); ··· 578 575 579 576 WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS, 580 577 "kernfs_put: %s/%s: released with incorrect active_ref %d\n", 581 - parent ? parent->name : "", kn->name, atomic_read(&kn->active)); 578 + parent ? rcu_dereference(parent->name) : "", 579 + rcu_dereference(kn->name), atomic_read(&kn->active)); 582 580 583 581 if (kernfs_type(kn) == KERNFS_LINK) 584 582 kernfs_put(kn->symlink.target_kn); ··· 656 652 atomic_set(&kn->active, KN_DEACTIVATED_BIAS); 657 653 RB_CLEAR_NODE(&kn->rb); 658 654 659 - kn->name = name; 655 + rcu_assign_pointer(kn->name, name); 660 656 kn->mode = mode; 661 657 kn->flags = flags; 662 658 ··· 794 790 ret = -EINVAL; 795 791 has_ns = kernfs_ns_enabled(parent); 796 792 if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", 797 - has_ns ? "required" : "invalid", parent->name, kn->name)) 793 + has_ns ? "required" : "invalid", 794 + kernfs_rcu_name(parent), kernfs_rcu_name(kn))) 798 795 goto out_unlock; 799 796 800 797 if (kernfs_type(parent) != KERNFS_DIR) ··· 805 800 if (parent->flags & (KERNFS_REMOVING | KERNFS_EMPTY_DIR)) 806 801 goto out_unlock; 807 802 808 - kn->hash = kernfs_name_hash(kn->name, kn->ns); 803 + kn->hash = kernfs_name_hash(kernfs_rcu_name(kn), kn->ns); 809 804 810 805 ret = kernfs_link_sibling(kn); 811 806 if (ret) ··· 861 856 862 857 if (has_ns != (bool)ns) { 863 858 WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", 864 - has_ns ? "required" : "invalid", parent->name, name); 859 + has_ns ? "required" : "invalid", kernfs_rcu_name(parent), name); 865 860 return NULL; 866 861 } 867 862 ··· 1140 1135 1141 1136 /* Negative hashed dentry? */ 1142 1137 if (d_really_is_negative(dentry)) { 1143 - struct kernfs_node *parent; 1144 - 1145 1138 /* If the kernfs parent node has changed discard and 1146 1139 * proceed to ->lookup. 1147 1140 * ··· 1187 1184 goto out_bad; 1188 1185 1189 1186 /* The kernfs node has been renamed */ 1190 - if (strcmp(dentry->d_name.name, kn->name) != 0) 1187 + if (strcmp(dentry->d_name.name, kernfs_rcu_name(kn)) != 0) 1191 1188 goto out_bad; 1192 1189 1193 1190 /* The kernfs node has been moved to a different namespace */ ··· 1481 1478 if (kernfs_parent(kn) && RB_EMPTY_NODE(&kn->rb)) 1482 1479 return; 1483 1480 1484 - pr_debug("kernfs %s: removing\n", kn->name); 1481 + pr_debug("kernfs %s: removing\n", kernfs_rcu_name(kn)); 1485 1482 1486 1483 /* prevent new usage by marking all nodes removing and deactivating */ 1487 1484 pos = NULL; ··· 1737 1734 { 1738 1735 struct kernfs_node *old_parent; 1739 1736 struct kernfs_root *root; 1740 - const char *old_name = NULL; 1737 + const char *old_name; 1741 1738 int error; 1742 1739 1743 1740 /* can't move or rename root */ ··· 1760 1757 } 1761 1758 1762 1759 error = 0; 1760 + old_name = kernfs_rcu_name(kn); 1761 + if (!new_name) 1762 + new_name = old_name; 1763 1763 if ((old_parent == new_parent) && (kn->ns == new_ns) && 1764 - (strcmp(kn->name, new_name) == 0)) 1764 + (strcmp(old_name, new_name) == 0)) 1765 1765 goto out; /* nothing to rename */ 1766 1766 1767 1767 error = -EEXIST; ··· 1772 1766 goto out; 1773 1767 1774 1768 /* rename kernfs_node */ 1775 - if (strcmp(kn->name, new_name) != 0) { 1769 + if (strcmp(old_name, new_name) != 0) { 1776 1770 error = -ENOMEM; 1777 1771 new_name = kstrdup_const(new_name, GFP_KERNEL); 1778 1772 if (!new_name) ··· 1785 1779 * Move to the appropriate place in the appropriate directories rbtree. 1786 1780 */ 1787 1781 kernfs_unlink_sibling(kn); 1788 - kernfs_get(new_parent); 1789 1782 1790 - /* rename_lock protects ->parent and ->name accessors */ 1791 - write_lock_irq(&kernfs_rename_lock); 1783 + /* rename_lock protects ->parent accessors */ 1784 + if (old_parent != new_parent) { 1785 + kernfs_get(new_parent); 1786 + write_lock_irq(&kernfs_rename_lock); 1792 1787 1793 - old_parent = kernfs_parent(kn); 1794 - rcu_assign_pointer(kn->__parent, new_parent); 1788 + rcu_assign_pointer(kn->__parent, new_parent); 1795 1789 1796 - kn->ns = new_ns; 1797 - if (new_name) { 1798 - old_name = kn->name; 1799 - kn->name = new_name; 1790 + kn->ns = new_ns; 1791 + if (new_name) 1792 + rcu_assign_pointer(kn->name, new_name); 1793 + 1794 + write_unlock_irq(&kernfs_rename_lock); 1795 + kernfs_put(old_parent); 1796 + } else { 1797 + /* name assignment is RCU protected, parent is the same */ 1798 + kn->ns = new_ns; 1799 + if (new_name) 1800 + rcu_assign_pointer(kn->name, new_name); 1800 1801 } 1801 1802 1802 - write_unlock_irq(&kernfs_rename_lock); 1803 - 1804 - kn->hash = kernfs_name_hash(kn->name, kn->ns); 1803 + kn->hash = kernfs_name_hash(new_name ?: old_name, kn->ns); 1805 1804 kernfs_link_sibling(kn); 1806 1805 1807 - kernfs_put(old_parent); 1808 - kfree_const(old_name); 1806 + if (new_name && !is_kernel_rodata((unsigned long)old_name)) 1807 + kfree_rcu_mightsleep(old_name); 1809 1808 1810 1809 error = 0; 1811 1810 out: ··· 1895 1884 for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos); 1896 1885 pos; 1897 1886 pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) { 1898 - const char *name = pos->name; 1887 + const char *name = kernfs_rcu_name(pos); 1899 1888 unsigned int type = fs_umode_to_dtype(pos->mode); 1900 1889 int len = strlen(name); 1901 1890 ino_t ino = kernfs_ino(pos);
+3 -1
fs/kernfs/file.c
··· 915 915 list_for_each_entry(info, &kernfs_root(kn)->supers, node) { 916 916 struct kernfs_node *parent; 917 917 struct inode *p_inode = NULL; 918 + const char *kn_name; 918 919 struct inode *inode; 919 920 struct qstr name; 920 921 ··· 929 928 if (!inode) 930 929 continue; 931 930 932 - name = QSTR(kn->name); 931 + kn_name = kernfs_rcu_name(kn); 932 + name = QSTR(kn_name); 933 933 parent = kernfs_get_parent(kn); 934 934 if (parent) { 935 935 p_inode = ilookup(info->sb, kernfs_ino(parent));
+5
fs/kernfs/kernfs-internal.h
··· 107 107 return lockdep_is_held(&kernfs_root(kn)->kernfs_rwsem); 108 108 } 109 109 110 + static inline const char *kernfs_rcu_name(const struct kernfs_node *kn) 111 + { 112 + return rcu_dereference_check(kn->name, kernfs_root_is_locked(kn)); 113 + } 114 + 110 115 static inline struct kernfs_node *kernfs_parent(const struct kernfs_node *kn) 111 116 { 112 117 /*
+3 -2
fs/kernfs/mount.c
··· 231 231 do { 232 232 struct dentry *dtmp; 233 233 struct kernfs_node *kntmp; 234 + const char *name; 234 235 235 236 if (kn == knparent) 236 237 return dentry; ··· 240 239 dput(dentry); 241 240 return ERR_PTR(-EINVAL); 242 241 } 243 - dtmp = lookup_positive_unlocked(kntmp->name, dentry, 244 - strlen(kntmp->name)); 242 + name = rcu_dereference(kntmp->name); 243 + dtmp = lookup_positive_unlocked(name, dentry, strlen(name)); 245 244 dput(dentry); 246 245 if (IS_ERR(dtmp)) 247 246 return dtmp;
+4 -3
fs/kernfs/symlink.c
··· 81 81 /* determine end of target string for reverse fillup */ 82 82 kn = target; 83 83 while (kernfs_parent(kn) && kn != base) { 84 - len += strlen(kn->name) + 1; 84 + len += strlen(kernfs_rcu_name(kn)) + 1; 85 85 kn = kernfs_parent(kn); 86 86 } 87 87 ··· 95 95 /* reverse fillup of target string from target to base */ 96 96 kn = target; 97 97 while (kernfs_parent(kn) && kn != base) { 98 - int slen = strlen(kn->name); 98 + const char *name = kernfs_rcu_name(kn); 99 + int slen = strlen(name); 99 100 100 101 len -= slen; 101 - memcpy(s + len, kn->name, slen); 102 + memcpy(s + len, name, slen); 102 103 if (len) 103 104 s[--len] = '/'; 104 105
+1 -1
fs/sysfs/dir.c
··· 123 123 new_parent = new_parent_kobj && new_parent_kobj->sd ? 124 124 new_parent_kobj->sd : sysfs_root_kn; 125 125 126 - return kernfs_rename_ns(kn, new_parent, kn->name, new_ns); 126 + return kernfs_rename_ns(kn, new_parent, NULL, new_ns); 127 127 } 128 128 129 129 /**
+2 -2
include/linux/kernfs.h
··· 204 204 * never moved to a different parent, it is safe to access the 205 205 * parent directly. 206 206 */ 207 - const char *name; 208 207 struct kernfs_node __rcu *__parent; 208 + const char __rcu *name; 209 209 210 210 struct rb_node rb; 211 211 ··· 400 400 } 401 401 402 402 int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen); 403 - int kernfs_path_from_node(struct kernfs_node *root_kn, struct kernfs_node *kn, 403 + int kernfs_path_from_node(struct kernfs_node *kn_to, struct kernfs_node *kn_from, 404 404 char *buf, size_t buflen); 405 405 void pr_cont_kernfs_name(struct kernfs_node *kn); 406 406 void pr_cont_kernfs_path(struct kernfs_node *kn);
+5 -2
security/selinux/hooks.c
··· 3584 3584 newsid = tsec->create_sid; 3585 3585 } else { 3586 3586 u16 secclass = inode_mode_to_security_class(kn->mode); 3587 + const char *kn_name; 3587 3588 struct qstr q; 3588 3589 3589 - q.name = kn->name; 3590 - q.hash_len = hashlen_string(kn_dir, kn->name); 3590 + /* kn is fresh, can't be renamed, name goes not away */ 3591 + kn_name = rcu_dereference_check(kn->name, true); 3592 + q.name = kn_name; 3593 + q.hash_len = hashlen_string(kn_dir, kn_name); 3591 3594 3592 3595 rc = security_transition_sid(tsec->sid, 3593 3596 parent_sid, secclass, &q,