Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

NFSD: Convert filecache to rhltable

While we were converting the nfs4_file hashtable to use the kernel's
resizable hashtable data structure, Neil Brown observed that the
list variant (rhltable) would be better for managing nfsd_file items
as well. The nfsd_file hash table will contain multiple entries for
the same inode -- these should be kept together on a list. And, it
could be possible for exotic or malicious client behavior to cause
the hash table to resize itself on every insertion.

A nice simplification is that rhltable_lookup() can return a list
that contains only nfsd_file items that match a given inode, which
enables us to eliminate specialized hash table helper functions and
use the default functions provided by the rhashtable implementation.

Since we are now storing nfsd_file items for the same inode on a
single list, that effectively reduces the number of hash entries
that have to be tracked in the hash table. The minimum bucket count
is therefore lowered.

Light testing with fstests generic/531 shows no regressions.

Suggested-by: Neil Brown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>

+133 -187
+128 -183
fs/nfsd/filecache.c
··· 74 74 static unsigned long nfsd_file_flags; 75 75 static struct fsnotify_group *nfsd_file_fsnotify_group; 76 76 static struct delayed_work nfsd_filecache_laundrette; 77 - static struct rhashtable nfsd_file_rhash_tbl 77 + static struct rhltable nfsd_file_rhltable 78 78 ____cacheline_aligned_in_smp; 79 - 80 - enum nfsd_file_lookup_type { 81 - NFSD_FILE_KEY_INODE, 82 - NFSD_FILE_KEY_FULL, 83 - }; 84 - 85 - struct nfsd_file_lookup_key { 86 - struct inode *inode; 87 - struct net *net; 88 - const struct cred *cred; 89 - unsigned char need; 90 - bool gc; 91 - enum nfsd_file_lookup_type type; 92 - }; 93 - 94 - /* 95 - * The returned hash value is based solely on the address of an in-code 96 - * inode, a pointer to a slab-allocated object. The entropy in such a 97 - * pointer is concentrated in its middle bits. 98 - */ 99 - static u32 nfsd_file_inode_hash(const struct inode *inode, u32 seed) 100 - { 101 - unsigned long ptr = (unsigned long)inode; 102 - u32 k; 103 - 104 - k = ptr >> L1_CACHE_SHIFT; 105 - k &= 0x00ffffff; 106 - return jhash2(&k, 1, seed); 107 - } 108 - 109 - /** 110 - * nfsd_file_key_hashfn - Compute the hash value of a lookup key 111 - * @data: key on which to compute the hash value 112 - * @len: rhash table's key_len parameter (unused) 113 - * @seed: rhash table's random seed of the day 114 - * 115 - * Return value: 116 - * Computed 32-bit hash value 117 - */ 118 - static u32 nfsd_file_key_hashfn(const void *data, u32 len, u32 seed) 119 - { 120 - const struct nfsd_file_lookup_key *key = data; 121 - 122 - return nfsd_file_inode_hash(key->inode, seed); 123 - } 124 - 125 - /** 126 - * nfsd_file_obj_hashfn - Compute the hash value of an nfsd_file 127 - * @data: object on which to compute the hash value 128 - * @len: rhash table's key_len parameter (unused) 129 - * @seed: rhash table's random seed of the day 130 - * 131 - * Return value: 132 - * Computed 32-bit hash value 133 - */ 134 - static u32 nfsd_file_obj_hashfn(const void *data, u32 len, u32 seed) 135 
- { 136 - const struct nfsd_file *nf = data; 137 - 138 - return nfsd_file_inode_hash(nf->nf_inode, seed); 139 - } 140 79 141 80 static bool 142 81 nfsd_match_cred(const struct cred *c1, const struct cred *c2) ··· 97 158 return true; 98 159 } 99 160 100 - /** 101 - * nfsd_file_obj_cmpfn - Match a cache item against search criteria 102 - * @arg: search criteria 103 - * @ptr: cache item to check 104 - * 105 - * Return values: 106 - * %0 - Item matches search criteria 107 - * %1 - Item does not match search criteria 108 - */ 109 - static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg, 110 - const void *ptr) 111 - { 112 - const struct nfsd_file_lookup_key *key = arg->key; 113 - const struct nfsd_file *nf = ptr; 114 - 115 - switch (key->type) { 116 - case NFSD_FILE_KEY_INODE: 117 - if (test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc) 118 - return 1; 119 - if (nf->nf_inode != key->inode) 120 - return 1; 121 - break; 122 - case NFSD_FILE_KEY_FULL: 123 - if (nf->nf_inode != key->inode) 124 - return 1; 125 - if (nf->nf_may != key->need) 126 - return 1; 127 - if (nf->nf_net != key->net) 128 - return 1; 129 - if (!nfsd_match_cred(nf->nf_cred, key->cred)) 130 - return 1; 131 - if (test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc) 132 - return 1; 133 - if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) 134 - return 1; 135 - break; 136 - } 137 - return 0; 138 - } 139 - 140 161 static const struct rhashtable_params nfsd_file_rhash_params = { 141 162 .key_len = sizeof_field(struct nfsd_file, nf_inode), 142 163 .key_offset = offsetof(struct nfsd_file, nf_inode), 143 - .head_offset = offsetof(struct nfsd_file, nf_rhash), 144 - .hashfn = nfsd_file_key_hashfn, 145 - .obj_hashfn = nfsd_file_obj_hashfn, 146 - .obj_cmpfn = nfsd_file_obj_cmpfn, 147 - /* Reduce resizing churn on light workloads */ 148 - .min_size = 512, /* buckets */ 164 + .head_offset = offsetof(struct nfsd_file, nf_rlist), 165 + 166 + /* 167 + * Start with a single page hash table to reduce resizing churn 
168 + * on light workloads. 169 + */ 170 + .min_size = 256, 149 171 .automatic_shrinking = true, 150 172 }; 151 173 ··· 209 309 } 210 310 211 311 static struct nfsd_file * 212 - nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may) 312 + nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need, 313 + bool want_gc) 213 314 { 214 315 struct nfsd_file *nf; 215 316 216 317 nf = kmem_cache_alloc(nfsd_file_slab, GFP_KERNEL); 217 - if (nf) { 218 - INIT_LIST_HEAD(&nf->nf_lru); 219 - nf->nf_birthtime = ktime_get(); 220 - nf->nf_file = NULL; 221 - nf->nf_cred = get_current_cred(); 222 - nf->nf_net = key->net; 223 - nf->nf_flags = 0; 224 - __set_bit(NFSD_FILE_HASHED, &nf->nf_flags); 225 - __set_bit(NFSD_FILE_PENDING, &nf->nf_flags); 226 - if (key->gc) 227 - __set_bit(NFSD_FILE_GC, &nf->nf_flags); 228 - nf->nf_inode = key->inode; 229 - refcount_set(&nf->nf_ref, 1); 230 - nf->nf_may = key->need; 231 - nf->nf_mark = NULL; 232 - } 318 + if (unlikely(!nf)) 319 + return NULL; 320 + 321 + INIT_LIST_HEAD(&nf->nf_lru); 322 + nf->nf_birthtime = ktime_get(); 323 + nf->nf_file = NULL; 324 + nf->nf_cred = get_current_cred(); 325 + nf->nf_net = net; 326 + nf->nf_flags = want_gc ? 
327 + BIT(NFSD_FILE_HASHED) | BIT(NFSD_FILE_PENDING) | BIT(NFSD_FILE_GC) : 328 + BIT(NFSD_FILE_HASHED) | BIT(NFSD_FILE_PENDING); 329 + nf->nf_inode = inode; 330 + refcount_set(&nf->nf_ref, 1); 331 + nf->nf_may = need; 332 + nf->nf_mark = NULL; 233 333 return nf; 234 334 } 235 335 ··· 254 354 nfsd_file_hash_remove(struct nfsd_file *nf) 255 355 { 256 356 trace_nfsd_file_unhash(nf); 257 - rhashtable_remove_fast(&nfsd_file_rhash_tbl, &nf->nf_rhash, 258 - nfsd_file_rhash_params); 357 + rhltable_remove(&nfsd_file_rhltable, &nf->nf_rlist, 358 + nfsd_file_rhash_params); 259 359 } 260 360 261 361 static bool ··· 588 688 * @inode: inode on which to close out nfsd_files 589 689 * @dispose: list on which to gather nfsd_files to close out 590 690 * 591 - * An nfsd_file represents a struct file being held open on behalf of nfsd. An 592 - * open file however can block other activity (such as leases), or cause 691 + * An nfsd_file represents a struct file being held open on behalf of nfsd. 692 + * An open file however can block other activity (such as leases), or cause 593 693 * undesirable behavior (e.g. spurious silly-renames when reexporting NFS). 
594 694 * 595 695 * This function is intended to find open nfsd_files when this sort of ··· 602 702 static void 603 703 nfsd_file_queue_for_close(struct inode *inode, struct list_head *dispose) 604 704 { 605 - struct nfsd_file_lookup_key key = { 606 - .type = NFSD_FILE_KEY_INODE, 607 - .inode = inode, 608 - .gc = true, 609 - }; 705 + struct rhlist_head *tmp, *list; 610 706 struct nfsd_file *nf; 611 707 612 708 rcu_read_lock(); 613 - do { 614 - nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key, 615 - nfsd_file_rhash_params); 616 - if (!nf) 617 - break; 709 + list = rhltable_lookup(&nfsd_file_rhltable, &inode, 710 + nfsd_file_rhash_params); 711 + rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist) { 712 + if (!test_bit(NFSD_FILE_GC, &nf->nf_flags)) 713 + continue; 618 714 nfsd_file_cond_queue(nf, dispose); 619 - } while (1); 715 + } 620 716 rcu_read_unlock(); 621 717 } 622 718 ··· 736 840 if (test_and_set_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1) 737 841 return 0; 738 842 739 - ret = rhashtable_init(&nfsd_file_rhash_tbl, &nfsd_file_rhash_params); 843 + ret = rhltable_init(&nfsd_file_rhltable, &nfsd_file_rhash_params); 740 844 if (ret) 741 845 return ret; 742 846 ··· 804 908 nfsd_file_mark_slab = NULL; 805 909 destroy_workqueue(nfsd_filecache_wq); 806 910 nfsd_filecache_wq = NULL; 807 - rhashtable_destroy(&nfsd_file_rhash_tbl); 911 + rhltable_destroy(&nfsd_file_rhltable); 808 912 goto out; 809 913 } 810 914 ··· 823 927 struct nfsd_file *nf; 824 928 LIST_HEAD(dispose); 825 929 826 - rhashtable_walk_enter(&nfsd_file_rhash_tbl, &iter); 930 + rhltable_walk_enter(&nfsd_file_rhltable, &iter); 827 931 do { 828 932 rhashtable_walk_start(&iter); 829 933 ··· 929 1033 nfsd_file_mark_slab = NULL; 930 1034 destroy_workqueue(nfsd_filecache_wq); 931 1035 nfsd_filecache_wq = NULL; 932 - rhashtable_destroy(&nfsd_file_rhash_tbl); 1036 + rhltable_destroy(&nfsd_file_rhltable); 933 1037 934 1038 for_each_possible_cpu(i) { 935 1039 per_cpu(nfsd_file_cache_hits, i) = 0; ··· 938 1042 
per_cpu(nfsd_file_total_age, i) = 0; 939 1043 per_cpu(nfsd_file_evictions, i) = 0; 940 1044 } 1045 + } 1046 + 1047 + static struct nfsd_file * 1048 + nfsd_file_lookup_locked(const struct net *net, const struct cred *cred, 1049 + struct inode *inode, unsigned char need, 1050 + bool want_gc) 1051 + { 1052 + struct rhlist_head *tmp, *list; 1053 + struct nfsd_file *nf; 1054 + 1055 + list = rhltable_lookup(&nfsd_file_rhltable, &inode, 1056 + nfsd_file_rhash_params); 1057 + rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist) { 1058 + if (nf->nf_may != need) 1059 + continue; 1060 + if (nf->nf_net != net) 1061 + continue; 1062 + if (!nfsd_match_cred(nf->nf_cred, cred)) 1063 + continue; 1064 + if (test_bit(NFSD_FILE_GC, &nf->nf_flags) != want_gc) 1065 + continue; 1066 + if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) 1067 + continue; 1068 + 1069 + if (!nfsd_file_get(nf)) 1070 + continue; 1071 + return nf; 1072 + } 1073 + return NULL; 941 1074 } 942 1075 943 1076 /** ··· 983 1058 bool 984 1059 nfsd_file_is_cached(struct inode *inode) 985 1060 { 986 - struct nfsd_file_lookup_key key = { 987 - .type = NFSD_FILE_KEY_INODE, 988 - .inode = inode, 989 - .gc = true, 990 - }; 1061 + struct rhlist_head *tmp, *list; 1062 + struct nfsd_file *nf; 991 1063 bool ret = false; 992 1064 993 - if (rhashtable_lookup_fast(&nfsd_file_rhash_tbl, &key, 994 - nfsd_file_rhash_params) != NULL) 995 - ret = true; 1065 + rcu_read_lock(); 1066 + list = rhltable_lookup(&nfsd_file_rhltable, &inode, 1067 + nfsd_file_rhash_params); 1068 + rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist) 1069 + if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) { 1070 + ret = true; 1071 + break; 1072 + } 1073 + rcu_read_unlock(); 1074 + 996 1075 trace_nfsd_file_is_cached(inode, (int)ret); 997 1076 return ret; 998 1077 } ··· 1006 1077 unsigned int may_flags, struct file *file, 1007 1078 struct nfsd_file **pnf, bool want_gc) 1008 1079 { 1009 - struct nfsd_file_lookup_key key = { 1010 - .type = NFSD_FILE_KEY_FULL, 1011 - .need = 
may_flags & NFSD_FILE_MAY_MASK, 1012 - .net = SVC_NET(rqstp), 1013 - .gc = want_gc, 1014 - }; 1080 + unsigned char need = may_flags & NFSD_FILE_MAY_MASK; 1081 + struct net *net = SVC_NET(rqstp); 1082 + struct nfsd_file *new, *nf; 1083 + const struct cred *cred; 1015 1084 bool open_retry = true; 1016 - struct nfsd_file *nf; 1085 + struct inode *inode; 1017 1086 __be32 status; 1018 1087 int ret; 1019 1088 ··· 1019 1092 may_flags|NFSD_MAY_OWNER_OVERRIDE); 1020 1093 if (status != nfs_ok) 1021 1094 return status; 1022 - key.inode = d_inode(fhp->fh_dentry); 1023 - key.cred = get_current_cred(); 1095 + inode = d_inode(fhp->fh_dentry); 1096 + cred = get_current_cred(); 1024 1097 1025 1098 retry: 1026 1099 rcu_read_lock(); 1027 - nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key, 1028 - nfsd_file_rhash_params); 1029 - nf = nfsd_file_get(nf); 1100 + nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc); 1030 1101 rcu_read_unlock(); 1031 1102 1032 1103 if (nf) { ··· 1038 1113 goto wait_for_construction; 1039 1114 } 1040 1115 1041 - nf = nfsd_file_alloc(&key, may_flags); 1042 - if (!nf) { 1116 + new = nfsd_file_alloc(net, inode, need, want_gc); 1117 + if (!new) { 1043 1118 status = nfserr_jukebox; 1044 1119 goto out; 1045 1120 } 1046 1121 1047 - ret = rhashtable_lookup_insert_key(&nfsd_file_rhash_tbl, 1048 - &key, &nf->nf_rhash, 1049 - nfsd_file_rhash_params); 1122 + rcu_read_lock(); 1123 + spin_lock(&inode->i_lock); 1124 + nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc); 1125 + if (unlikely(nf)) { 1126 + spin_unlock(&inode->i_lock); 1127 + rcu_read_unlock(); 1128 + nfsd_file_slab_free(&new->nf_rcu); 1129 + goto wait_for_construction; 1130 + } 1131 + nf = new; 1132 + ret = rhltable_insert(&nfsd_file_rhltable, &nf->nf_rlist, 1133 + nfsd_file_rhash_params); 1134 + spin_unlock(&inode->i_lock); 1135 + rcu_read_unlock(); 1050 1136 if (likely(ret == 0)) 1051 1137 goto open_file; 1052 1138 1053 1139 if (ret == -EEXIST) 1054 1140 goto retry; 1055 - 
trace_nfsd_file_insert_err(rqstp, key.inode, may_flags, ret); 1141 + trace_nfsd_file_insert_err(rqstp, inode, may_flags, ret); 1056 1142 status = nfserr_jukebox; 1057 1143 goto construction_err; 1058 1144 ··· 1072 1136 1073 1137 /* Did construction of this file fail? */ 1074 1138 if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { 1075 - trace_nfsd_file_cons_err(rqstp, key.inode, may_flags, nf); 1139 + trace_nfsd_file_cons_err(rqstp, inode, may_flags, nf); 1076 1140 if (!open_retry) { 1077 1141 status = nfserr_jukebox; 1078 1142 goto construction_err; ··· 1094 1158 nfsd_file_check_write_error(nf); 1095 1159 *pnf = nf; 1096 1160 } 1097 - put_cred(key.cred); 1098 - trace_nfsd_file_acquire(rqstp, key.inode, may_flags, nf, status); 1161 + put_cred(cred); 1162 + trace_nfsd_file_acquire(rqstp, inode, may_flags, nf, status); 1099 1163 return status; 1100 1164 1101 1165 open_file: 1102 1166 trace_nfsd_file_alloc(nf); 1103 - nf->nf_mark = nfsd_file_mark_find_or_create(nf, key.inode); 1167 + nf->nf_mark = nfsd_file_mark_find_or_create(nf, inode); 1104 1168 if (nf->nf_mark) { 1105 1169 if (file) { 1106 1170 get_file(file); ··· 1118 1182 * If construction failed, or we raced with a call to unlink() 1119 1183 * then unhash. 1120 1184 */ 1121 - if (status == nfs_ok && key.inode->i_nlink == 0) 1185 + if (status != nfs_ok || inode->i_nlink == 0) 1122 1186 status = nfserr_jukebox; 1123 1187 if (status != nfs_ok) 1124 1188 nfsd_file_unhash(nf); ··· 1145 1209 * seconds after the final nfsd_file_put() in case the caller 1146 1210 * wants to re-use it. 1147 1211 * 1148 - * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in 1149 - * network byte order is returned. 1212 + * Return values: 1213 + * %nfs_ok - @pnf points to an nfsd_file with its reference 1214 + * count boosted. 1215 + * 1216 + * On error, an nfsstat value in network byte order is returned. 
1150 1217 */ 1151 1218 __be32 1152 1219 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp, ··· 1169 1230 * but not garbage-collected. The object is unhashed after the 1170 1231 * final nfsd_file_put(). 1171 1232 * 1172 - * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in 1173 - * network byte order is returned. 1233 + * Return values: 1234 + * %nfs_ok - @pnf points to an nfsd_file with its reference 1235 + * count boosted. 1236 + * 1237 + * On error, an nfsstat value in network byte order is returned. 1174 1238 */ 1175 1239 __be32 1176 1240 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, ··· 1194 1252 * and @file is non-NULL, use it to instantiate a new nfsd_file instead of 1195 1253 * opening a new one. 1196 1254 * 1197 - * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in 1198 - * network byte order is returned. 1255 + * Return values: 1256 + * %nfs_ok - @pnf points to an nfsd_file with its reference 1257 + * count boosted. 1258 + * 1259 + * On error, an nfsstat value in network byte order is returned. 1199 1260 */ 1200 1261 __be32 1201 1262 nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp, ··· 1229 1284 lru = list_lru_count(&nfsd_file_lru); 1230 1285 1231 1286 rcu_read_lock(); 1232 - ht = &nfsd_file_rhash_tbl; 1287 + ht = &nfsd_file_rhltable.ht; 1233 1288 count = atomic_read(&ht->nelems); 1234 1289 tbl = rht_dereference_rcu(ht->tbl, ht); 1235 1290 buckets = tbl->size; ··· 1245 1300 evictions += per_cpu(nfsd_file_evictions, i); 1246 1301 } 1247 1302 1248 - seq_printf(m, "total entries: %u\n", count); 1303 + seq_printf(m, "total inodes: %u\n", count); 1249 1304 seq_printf(m, "hash buckets: %u\n", buckets); 1250 1305 seq_printf(m, "lru entries: %lu\n", lru); 1251 1306 seq_printf(m, "cache hits: %lu\n", hits);
+5 -4
fs/nfsd/filecache.h
··· 29 29 * never be dereferenced, only used for comparison. 30 30 */ 31 31 struct nfsd_file { 32 - struct rhash_head nf_rhash; 33 - struct list_head nf_lru; 34 - struct rcu_head nf_rcu; 32 + struct rhlist_head nf_rlist; 33 + void *nf_inode; 35 34 struct file *nf_file; 36 35 const struct cred *nf_cred; 37 36 struct net *nf_net; ··· 39 40 #define NFSD_FILE_REFERENCED (2) 40 41 #define NFSD_FILE_GC (3) 41 42 unsigned long nf_flags; 42 - struct inode *nf_inode; /* don't deref */ 43 43 refcount_t nf_ref; 44 44 unsigned char nf_may; 45 + 45 46 struct nfsd_file_mark *nf_mark; 47 + struct list_head nf_lru; 48 + struct rcu_head nf_rcu; 46 49 ktime_t nf_birthtime; 47 50 }; 48 51