Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

fs: Use rename lock and RCU for multi-step operations

The remaining uses of dcache_lock are to allow atomic, multi-step read-side
operations over the directory tree by excluding modifications to the tree,
and to walk in the leaf->root direction in the tree, where we don't have
a natural d_lock ordering.

This could be accomplished by taking every d_lock, but this would mean a
huge number of locks and actually gets very tricky.

Solve this instead by using the rename seqlock for multi-step read-side
operations, retrying in case of a rename so we don't walk up the wrong parent.
Concurrent dentry insertions are not serialised against. Concurrent deletes
are tricky when walking up the directory: our parent might have been deleted
while we had dropped locks, so we also need to check and retry for that.

We can also use the rename lock in cases where livelock is a worry (this
is introduced in a subsequent patch).

Signed-off-by: Nick Piggin <npiggin@kernel.dk>

+151 -29
+12 -3
drivers/staging/pohmelfs/path_entry.c
··· 83 83 int pohmelfs_path_length(struct pohmelfs_inode *pi) 84 84 { 85 85 struct dentry *d, *root, *first; 86 - int len = 1; /* Root slash */ 86 + int len; 87 + unsigned seq; 87 88 88 - first = d = d_find_alias(&pi->vfs_inode); 89 - if (!d) { 89 + first = d_find_alias(&pi->vfs_inode); 90 + if (!first) { 90 91 dprintk("%s: ino: %llu, mode: %o.\n", __func__, pi->ino, pi->vfs_inode.i_mode); 91 92 return -ENOENT; 92 93 } ··· 96 95 root = dget(current->fs->root.dentry); 97 96 spin_unlock(&current->fs->lock); 98 97 98 + rename_retry: 99 + len = 1; /* Root slash */ 100 + d = first; 101 + seq = read_seqbegin(&rename_lock); 102 + rcu_read_lock(); 99 103 spin_lock(&dcache_lock); 100 104 101 105 if (!IS_ROOT(d) && d_unhashed(d)) ··· 111 105 d = d->d_parent; 112 106 } 113 107 spin_unlock(&dcache_lock); 108 + rcu_read_unlock(); 109 + if (read_seqretry(&rename_lock, seq)) 110 + goto rename_retry; 114 111 115 112 dput(root); 116 113 dput(first);
+14 -2
fs/autofs4/waitq.c
··· 186 186 { 187 187 struct dentry *root = sbi->sb->s_root; 188 188 struct dentry *tmp; 189 - char *buf = *name; 189 + char *buf; 190 190 char *p; 191 - int len = 0; 191 + int len; 192 + unsigned seq; 192 193 194 + rename_retry: 195 + buf = *name; 196 + len = 0; 197 + seq = read_seqbegin(&rename_lock); 198 + rcu_read_lock(); 193 199 spin_lock(&dcache_lock); 194 200 for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) 195 201 len += tmp->d_name.len + 1; 196 202 197 203 if (!len || --len > NAME_MAX) { 198 204 spin_unlock(&dcache_lock); 205 + rcu_read_unlock(); 206 + if (read_seqretry(&rename_lock, seq)) 207 + goto rename_retry; 199 208 return 0; 200 209 } 201 210 ··· 218 209 strncpy(p, tmp->d_name.name, tmp->d_name.len); 219 210 } 220 211 spin_unlock(&dcache_lock); 212 + rcu_read_unlock(); 213 + if (read_seqretry(&rename_lock, seq)) 214 + goto rename_retry; 221 215 222 216 return len; 223 217 }
+111 -23
fs/dcache.c
··· 80 80 __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock); 81 81 __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); 82 82 83 + EXPORT_SYMBOL(rename_lock); 83 84 EXPORT_SYMBOL(dcache_inode_lock); 84 85 EXPORT_SYMBOL(dcache_lock); 85 86 ··· 244 243 __releases(dcache_inode_lock) 245 244 __releases(dcache_lock) 246 245 { 246 + dentry->d_parent = NULL; 247 247 list_del(&dentry->d_u.d_child); 248 248 if (parent) 249 249 spin_unlock(&parent->d_lock); ··· 1019 1017 * Return true if the parent or its subdirectories contain 1020 1018 * a mount point 1021 1019 */ 1022 - 1023 1020 int have_submounts(struct dentry *parent) 1024 1021 { 1025 - struct dentry *this_parent = parent; 1022 + struct dentry *this_parent; 1026 1023 struct list_head *next; 1024 + unsigned seq; 1025 + 1026 + rename_retry: 1027 + this_parent = parent; 1028 + seq = read_seqbegin(&rename_lock); 1027 1029 1028 1030 spin_lock(&dcache_lock); 1029 1031 if (d_mountpoint(parent)) ··· 1061 1055 * All done at this level ... ascend and resume the search. 
1062 1056 */ 1063 1057 if (this_parent != parent) { 1064 - next = this_parent->d_u.d_child.next; 1058 + struct dentry *tmp; 1059 + struct dentry *child; 1060 + 1061 + tmp = this_parent->d_parent; 1062 + rcu_read_lock(); 1065 1063 spin_unlock(&this_parent->d_lock); 1066 - this_parent = this_parent->d_parent; 1064 + child = this_parent; 1065 + this_parent = tmp; 1067 1066 spin_lock(&this_parent->d_lock); 1067 + /* might go back up the wrong parent if we have had a rename 1068 + * or deletion */ 1069 + if (this_parent != child->d_parent || 1070 + read_seqretry(&rename_lock, seq)) { 1071 + spin_unlock(&this_parent->d_lock); 1072 + spin_unlock(&dcache_lock); 1073 + rcu_read_unlock(); 1074 + goto rename_retry; 1075 + } 1076 + rcu_read_unlock(); 1077 + next = child->d_u.d_child.next; 1068 1078 goto resume; 1069 1079 } 1070 1080 spin_unlock(&this_parent->d_lock); 1071 1081 spin_unlock(&dcache_lock); 1082 + if (read_seqretry(&rename_lock, seq)) 1083 + goto rename_retry; 1072 1084 return 0; /* No mount points found in tree */ 1073 1085 positive: 1074 1086 spin_unlock(&dcache_lock); 1087 + if (read_seqretry(&rename_lock, seq)) 1088 + goto rename_retry; 1075 1089 return 1; 1076 1090 } 1077 1091 EXPORT_SYMBOL(have_submounts); ··· 1112 1086 */ 1113 1087 static int select_parent(struct dentry * parent) 1114 1088 { 1115 - struct dentry *this_parent = parent; 1089 + struct dentry *this_parent; 1116 1090 struct list_head *next; 1091 + unsigned seq; 1117 1092 int found = 0; 1093 + 1094 + rename_retry: 1095 + this_parent = parent; 1096 + seq = read_seqbegin(&rename_lock); 1118 1097 1119 1098 spin_lock(&dcache_lock); 1120 1099 spin_lock(&this_parent->d_lock); ··· 1130 1099 struct list_head *tmp = next; 1131 1100 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); 1132 1101 next = tmp->next; 1133 - BUG_ON(this_parent == dentry); 1134 1102 1135 1103 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); 1136 1104 ··· 1172 1142 */ 1173 1143 if (this_parent != parent) { 
1174 1144 struct dentry *tmp; 1175 - next = this_parent->d_u.d_child.next; 1145 + struct dentry *child; 1146 + 1176 1147 tmp = this_parent->d_parent; 1148 + rcu_read_lock(); 1177 1149 spin_unlock(&this_parent->d_lock); 1178 - BUG_ON(tmp == this_parent); 1150 + child = this_parent; 1179 1151 this_parent = tmp; 1180 1152 spin_lock(&this_parent->d_lock); 1153 + /* might go back up the wrong parent if we have had a rename 1154 + * or deletion */ 1155 + if (this_parent != child->d_parent || 1156 + read_seqretry(&rename_lock, seq)) { 1157 + spin_unlock(&this_parent->d_lock); 1158 + spin_unlock(&dcache_lock); 1159 + rcu_read_unlock(); 1160 + goto rename_retry; 1161 + } 1162 + rcu_read_unlock(); 1163 + next = child->d_u.d_child.next; 1181 1164 goto resume; 1182 1165 } 1183 1166 out: 1184 1167 spin_unlock(&this_parent->d_lock); 1185 1168 spin_unlock(&dcache_lock); 1169 + if (read_seqretry(&rename_lock, seq)) 1170 + goto rename_retry; 1186 1171 return found; 1187 1172 } 1188 1173 ··· 1699 1654 struct dentry * d_lookup(struct dentry * parent, struct qstr * name) 1700 1655 { 1701 1656 struct dentry * dentry = NULL; 1702 - unsigned long seq; 1657 + unsigned seq; 1703 1658 1704 1659 do { 1705 1660 seq = read_seqbegin(&rename_lock); ··· 2335 2290 * @buffer: pointer to the end of the buffer 2336 2291 * @buflen: pointer to buffer length 2337 2292 * 2338 - * Caller holds the dcache_lock. 2293 + * Caller holds the rename_lock. 2339 2294 * 2340 2295 * If path is not reachable from the supplied root, then the value of 2341 2296 * root is changed (without modifying refcounts). 
··· 2422 2377 2423 2378 prepend(&res, &buflen, "\0", 1); 2424 2379 spin_lock(&dcache_lock); 2380 + write_seqlock(&rename_lock); 2425 2381 error = prepend_path(path, root, &res, &buflen); 2382 + write_sequnlock(&rename_lock); 2426 2383 spin_unlock(&dcache_lock); 2427 2384 2428 2385 if (error) ··· 2488 2441 2489 2442 get_fs_root(current->fs, &root); 2490 2443 spin_lock(&dcache_lock); 2444 + write_seqlock(&rename_lock); 2491 2445 tmp = root; 2492 2446 error = path_with_deleted(path, &tmp, &res, &buflen); 2493 2447 if (error) 2494 2448 res = ERR_PTR(error); 2449 + write_sequnlock(&rename_lock); 2495 2450 spin_unlock(&dcache_lock); 2496 2451 path_put(&root); 2497 2452 return res; ··· 2521 2472 2522 2473 get_fs_root(current->fs, &root); 2523 2474 spin_lock(&dcache_lock); 2475 + write_seqlock(&rename_lock); 2524 2476 tmp = root; 2525 2477 error = path_with_deleted(path, &tmp, &res, &buflen); 2526 2478 if (!error && !path_equal(&tmp, &root)) 2527 2479 error = prepend_unreachable(&res, &buflen); 2480 + write_sequnlock(&rename_lock); 2528 2481 spin_unlock(&dcache_lock); 2529 2482 path_put(&root); 2530 2483 if (error) ··· 2595 2544 char *retval; 2596 2545 2597 2546 spin_lock(&dcache_lock); 2547 + write_seqlock(&rename_lock); 2598 2548 retval = __dentry_path(dentry, buf, buflen); 2549 + write_sequnlock(&rename_lock); 2599 2550 spin_unlock(&dcache_lock); 2600 2551 2601 2552 return retval; ··· 2610 2557 char *retval; 2611 2558 2612 2559 spin_lock(&dcache_lock); 2560 + write_seqlock(&rename_lock); 2613 2561 if (d_unlinked(dentry)) { 2614 2562 p = buf + buflen; 2615 2563 if (prepend(&p, &buflen, "//deleted", 10) != 0) ··· 2618 2564 buflen++; 2619 2565 } 2620 2566 retval = __dentry_path(dentry, buf, buflen); 2567 + write_sequnlock(&rename_lock); 2621 2568 spin_unlock(&dcache_lock); 2622 2569 if (!IS_ERR(retval) && p) 2623 2570 *p = '/'; /* restore '/' overriden with '\0' */ ··· 2659 2604 2660 2605 error = -ENOENT; 2661 2606 spin_lock(&dcache_lock); 2607 + 
write_seqlock(&rename_lock); 2662 2608 if (!d_unlinked(pwd.dentry)) { 2663 2609 unsigned long len; 2664 2610 struct path tmp = root; ··· 2668 2612 2669 2613 prepend(&cwd, &buflen, "\0", 1); 2670 2614 error = prepend_path(&pwd, &tmp, &cwd, &buflen); 2615 + write_sequnlock(&rename_lock); 2671 2616 spin_unlock(&dcache_lock); 2672 2617 2673 2618 if (error) ··· 2688 2631 if (copy_to_user(buf, cwd, len)) 2689 2632 error = -EFAULT; 2690 2633 } 2691 - } else 2634 + } else { 2635 + write_sequnlock(&rename_lock); 2692 2636 spin_unlock(&dcache_lock); 2637 + } 2693 2638 2694 2639 out: 2695 2640 path_put(&pwd); ··· 2719 2660 int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) 2720 2661 { 2721 2662 int result; 2722 - unsigned long seq; 2663 + unsigned seq; 2723 2664 2724 2665 if (new_dentry == old_dentry) 2725 2666 return 1; 2726 2667 2727 - /* 2728 - * Need rcu_readlock to protect against the d_parent trashing 2729 - * due to d_move 2730 - */ 2731 - rcu_read_lock(); 2732 2668 do { 2733 2669 /* for restarting inner loop in case of seq retry */ 2734 2670 seq = read_seqbegin(&rename_lock); 2671 + /* 2672 + * Need rcu_readlock to protect against the d_parent trashing 2673 + * due to d_move 2674 + */ 2675 + rcu_read_lock(); 2735 2676 if (d_ancestor(old_dentry, new_dentry)) 2736 2677 result = 1; 2737 2678 else 2738 2679 result = 0; 2680 + rcu_read_unlock(); 2739 2681 } while (read_seqretry(&rename_lock, seq)); 2740 - rcu_read_unlock(); 2741 2682 2742 2683 return result; 2743 2684 } ··· 2769 2710 2770 2711 void d_genocide(struct dentry *root) 2771 2712 { 2772 - struct dentry *this_parent = root; 2713 + struct dentry *this_parent; 2773 2714 struct list_head *next; 2715 + unsigned seq; 2774 2716 2717 + rename_retry: 2718 + this_parent = root; 2719 + seq = read_seqbegin(&rename_lock); 2775 2720 spin_lock(&dcache_lock); 2776 2721 spin_lock(&this_parent->d_lock); 2777 2722 repeat: ··· 2785 2722 struct list_head *tmp = next; 2786 2723 struct dentry *dentry = list_entry(tmp, 
struct dentry, d_u.d_child); 2787 2724 next = tmp->next; 2725 + 2788 2726 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); 2789 2727 if (d_unhashed(dentry) || !dentry->d_inode) { 2790 2728 spin_unlock(&dentry->d_lock); ··· 2798 2734 spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); 2799 2735 goto repeat; 2800 2736 } 2801 - dentry->d_count--; 2737 + if (!(dentry->d_flags & DCACHE_GENOCIDE)) { 2738 + dentry->d_flags |= DCACHE_GENOCIDE; 2739 + dentry->d_count--; 2740 + } 2802 2741 spin_unlock(&dentry->d_lock); 2803 2742 } 2804 2743 if (this_parent != root) { 2805 - next = this_parent->d_u.d_child.next; 2806 - this_parent->d_count--; 2744 + struct dentry *tmp; 2745 + struct dentry *child; 2746 + 2747 + tmp = this_parent->d_parent; 2748 + if (!(this_parent->d_flags & DCACHE_GENOCIDE)) { 2749 + this_parent->d_flags |= DCACHE_GENOCIDE; 2750 + this_parent->d_count--; 2751 + } 2752 + rcu_read_lock(); 2807 2753 spin_unlock(&this_parent->d_lock); 2808 - this_parent = this_parent->d_parent; 2754 + child = this_parent; 2755 + this_parent = tmp; 2809 2756 spin_lock(&this_parent->d_lock); 2757 + /* might go back up the wrong parent if we have had a rename 2758 + * or deletion */ 2759 + if (this_parent != child->d_parent || 2760 + read_seqretry(&rename_lock, seq)) { 2761 + spin_unlock(&this_parent->d_lock); 2762 + spin_unlock(&dcache_lock); 2763 + rcu_read_unlock(); 2764 + goto rename_retry; 2765 + } 2766 + rcu_read_unlock(); 2767 + next = child->d_u.d_child.next; 2810 2768 goto resume; 2811 2769 } 2812 2770 spin_unlock(&this_parent->d_lock); 2813 2771 spin_unlock(&dcache_lock); 2772 + if (read_seqretry(&rename_lock, seq)) 2773 + goto rename_retry; 2814 2774 } 2815 2775 2816 2776 /**
+13 -1
fs/nfs/namespace.c
··· 49 49 const struct dentry *dentry, 50 50 char *buffer, ssize_t buflen) 51 51 { 52 - char *end = buffer+buflen; 52 + char *end; 53 53 int namelen; 54 + unsigned seq; 54 55 56 + rename_retry: 57 + end = buffer+buflen; 55 58 *--end = '\0'; 56 59 buflen--; 60 + 61 + seq = read_seqbegin(&rename_lock); 62 + rcu_read_lock(); 57 63 spin_lock(&dcache_lock); 58 64 while (!IS_ROOT(dentry) && dentry != droot) { 59 65 namelen = dentry->d_name.len; ··· 72 66 dentry = dentry->d_parent; 73 67 } 74 68 spin_unlock(&dcache_lock); 69 + rcu_read_unlock(); 70 + if (read_seqretry(&rename_lock, seq)) 71 + goto rename_retry; 75 72 if (*end != '/') { 76 73 if (--buflen < 0) 77 74 goto Elong; ··· 92 83 return end; 93 84 Elong_unlock: 94 85 spin_unlock(&dcache_lock); 86 + rcu_read_unlock(); 87 + if (read_seqretry(&rename_lock, seq)) 88 + goto rename_retry; 95 89 Elong: 96 90 return ERR_PTR(-ENAMETOOLONG); 97 91 }
+1
include/linux/dcache.h
··· 180 180 #define DCACHE_FSNOTIFY_PARENT_WATCHED 0x0080 /* Parent inode is watched by some fsnotify listener */ 181 181 182 182 #define DCACHE_CANT_MOUNT 0x0100 183 + #define DCACHE_GENOCIDE 0x0200 183 184 184 185 extern spinlock_t dcache_inode_lock; 185 186 extern spinlock_t dcache_lock;