Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ocfs2: Store dir index records inline

Allow us to store a small number of directory index records in the
ocfs2_dx_root_block. This saves us a disk read on small to medium sized
directories (less than about 250 entries). The inline root is automatically
turned into a root block with extents if the directory size increases beyond
its capacity.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Acked-by: Joel Becker <joel.becker@oracle.com>

+474 -148
+437 -141
fs/ocfs2/dir.c
··· 151 151 152 152 void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res) 153 153 { 154 + brelse(res->dl_dx_root_bh); 154 155 brelse(res->dl_leaf_bh); 155 156 brelse(res->dl_dx_leaf_bh); 156 157 } ··· 161 160 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL) 162 161 return 1; 163 162 return 0; 163 + } 164 + 165 + static inline int ocfs2_dx_root_inline(struct ocfs2_dx_root_block *dx_root) 166 + { 167 + return dx_root->dr_flags & OCFS2_DX_FLAG_INLINE; 164 168 } 165 169 166 170 /* ··· 805 799 * Returns the block index, from the start of the cluster which this 806 800 * hash belongs too. 807 801 */ 808 - static unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb, 802 + static inline unsigned int __ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb, 803 + u32 minor_hash) 804 + { 805 + return minor_hash & osb->osb_dx_mask; 806 + } 807 + 808 + static inline unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb, 809 809 struct ocfs2_dx_hinfo *hinfo) 810 810 { 811 - u32 minor_hash = hinfo->minor_hash; 812 - return minor_hash & osb->osb_dx_mask; 811 + return __ocfs2_dx_dir_hash_idx(osb, hinfo->minor_hash); 813 812 } 814 813 815 814 static int ocfs2_dx_dir_lookup(struct inode *inode, ··· 866 855 867 856 static int ocfs2_dx_dir_search(const char *name, int namelen, 868 857 struct inode *dir, 869 - struct ocfs2_extent_list *dr_el, 858 + struct ocfs2_dx_root_block *dx_root, 870 859 struct ocfs2_dir_lookup_result *res) 871 860 { 872 861 int ret, i, found; ··· 877 866 struct buffer_head *dir_ent_bh = NULL; 878 867 struct ocfs2_dir_entry *dir_ent = NULL; 879 868 struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo; 869 + struct ocfs2_extent_list *dr_el; 870 + struct ocfs2_dx_entry_list *entry_list; 880 871 881 872 ocfs2_dx_dir_name_hash(dir, name, namelen, &res->dl_hinfo); 873 + 874 + if (ocfs2_dx_root_inline(dx_root)) { 875 + entry_list = &dx_root->dr_entries; 876 + goto search; 877 + } 878 + 879 + dr_el = &dx_root->dr_list; 882 880 883 881 ret = 
ocfs2_dx_dir_lookup(dir, dr_el, hinfo, NULL, &phys); 884 882 if (ret) { ··· 913 893 le16_to_cpu(dx_leaf->dl_list.de_num_used), 914 894 le16_to_cpu(dx_leaf->dl_list.de_count)); 915 895 896 + entry_list = &dx_leaf->dl_list; 897 + 898 + search: 916 899 /* 917 900 * Empty leaf is legal, so no need to check for that. 918 901 */ 919 902 found = 0; 920 - for (i = 0; i < le16_to_cpu(dx_leaf->dl_list.de_num_used); i++) { 921 - dx_entry = &dx_leaf->dl_list.de_entries[i]; 903 + for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) { 904 + dx_entry = &entry_list->de_entries[i]; 922 905 923 906 if (hinfo->major_hash != le32_to_cpu(dx_entry->dx_major_hash) 924 907 || hinfo->minor_hash != le32_to_cpu(dx_entry->dx_minor_hash)) ··· 1005 982 } 1006 983 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; 1007 984 1008 - ret = ocfs2_dx_dir_search(name, namelen, dir, &dx_root->dr_list, 1009 - lookup); 985 + ret = ocfs2_dx_dir_search(name, namelen, dir, dx_root, lookup); 1010 986 if (ret) { 1011 987 if (ret != -ENOENT) 1012 988 mlog_errno(ret); 1013 989 goto out; 1014 990 } 1015 991 992 + lookup->dl_dx_root_bh = dx_root_bh; 993 + dx_root_bh = NULL; 1016 994 out: 1017 995 brelse(di_bh); 1018 996 brelse(dx_root_bh); ··· 1150 1126 return status; 1151 1127 } 1152 1128 1153 - static void ocfs2_dx_leaf_remove_entry(struct ocfs2_dx_leaf *dx_leaf, int index) 1129 + static void ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list *entry_list, 1130 + int index) 1154 1131 { 1155 - struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list; 1156 - int num_used = le16_to_cpu(dl_list->de_num_used); 1132 + int num_used = le16_to_cpu(entry_list->de_num_used); 1157 1133 1158 1134 if (num_used == 1 || index == (num_used - 1)) 1159 1135 goto clear; 1160 1136 1161 - memmove(&dl_list->de_entries[index], &dl_list->de_entries[index + 1], 1137 + memmove(&entry_list->de_entries[index], 1138 + &entry_list->de_entries[index + 1], 1162 1139 (num_used - index - 1)*sizeof(struct ocfs2_dx_entry)); 1163 
1140 clear: 1164 1141 num_used--; 1165 - memset(&dl_list->de_entries[num_used], 0, 1142 + memset(&entry_list->de_entries[num_used], 0, 1166 1143 sizeof(struct ocfs2_dx_entry)); 1167 - dl_list->de_num_used = cpu_to_le16(num_used); 1144 + entry_list->de_num_used = cpu_to_le16(num_used); 1168 1145 } 1169 1146 1170 1147 static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir, 1171 1148 struct ocfs2_dir_lookup_result *lookup) 1172 1149 { 1173 1150 int ret, index; 1151 + struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; 1174 1152 struct buffer_head *leaf_bh = lookup->dl_leaf_bh; 1175 1153 struct ocfs2_dx_leaf *dx_leaf; 1176 1154 struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry; 1155 + struct ocfs2_dx_root_block *dx_root; 1156 + struct ocfs2_dx_entry_list *entry_list; 1177 1157 1178 - dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data; 1158 + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 1159 + if (ocfs2_dx_root_inline(dx_root)) { 1160 + entry_list = &dx_root->dr_entries; 1161 + } else { 1162 + dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data; 1163 + entry_list = &dx_leaf->dl_list; 1164 + } 1165 + 1179 1166 /* Neither of these are a disk corruption - that should have 1180 1167 * been caught by lookup, before we got here. 
*/ 1181 - BUG_ON(le16_to_cpu(dx_leaf->dl_list.de_count) <= 0); 1182 - BUG_ON(le16_to_cpu(dx_leaf->dl_list.de_num_used) <= 0); 1168 + BUG_ON(le16_to_cpu(entry_list->de_count) <= 0); 1169 + BUG_ON(le16_to_cpu(entry_list->de_num_used) <= 0); 1183 1170 1184 - index = (char *)dx_entry - (char *)dx_leaf->dl_list.de_entries; 1171 + index = (char *)dx_entry - (char *)entry_list->de_entries; 1185 1172 index /= sizeof(*dx_entry); 1186 1173 1187 - if (index >= le16_to_cpu(dx_leaf->dl_list.de_num_used)) { 1174 + if (index >= le16_to_cpu(entry_list->de_num_used)) { 1188 1175 mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n", 1189 - (unsigned long long)OCFS2_I(dir)->ip_blkno, index, dx_leaf, 1190 - dx_entry); 1176 + (unsigned long long)OCFS2_I(dir)->ip_blkno, index, 1177 + entry_list, dx_entry); 1191 1178 return -EIO; 1179 + } 1180 + 1181 + /* 1182 + * Add the block holding our index into the journal before 1183 + * removing the unindexed entry. If we get an error return 1184 + * from __ocfs2_delete_entry(), then it hasn't removed the 1185 + * entry yet. Likewise, successful return means we *must* 1186 + * remove the indexed entry. 1187 + * 1188 + * We're also careful to journal the root tree block here if 1189 + * we're going to be adding to the start of the free list. 1190 + */ 1191 + if (ocfs2_dx_root_inline(dx_root)) { 1192 + ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, 1193 + OCFS2_JOURNAL_ACCESS_WRITE); 1194 + if (ret) { 1195 + mlog_errno(ret); 1196 + goto out; 1197 + } 1198 + } else { 1199 + ret = ocfs2_journal_access_dl(handle, dir, 1200 + lookup->dl_dx_leaf_bh, 1201 + OCFS2_JOURNAL_ACCESS_WRITE); 1202 + if (ret) { 1203 + mlog_errno(ret); 1204 + goto out; 1205 + } 1192 1206 } 1193 1207 1194 1208 mlog(0, "Dir %llu: delete entry at index: %d\n", 1195 1209 (unsigned long long)OCFS2_I(dir)->ip_blkno, index); 1196 - 1197 - /* 1198 - * Add the index leaf into the journal before removing the 1199 - * unindexed entry. 
If we get an error return from 1200 - * __ocfs2_delete_entry(), then it hasn't removed the entry 1201 - * yet. Likewise, successful return means we *must* remove the 1202 - * indexed entry. 1203 - */ 1204 - ret = ocfs2_journal_access_dl(handle, dir, lookup->dl_dx_leaf_bh, 1205 - OCFS2_JOURNAL_ACCESS_WRITE); 1206 - if (ret) { 1207 - mlog_errno(ret); 1208 - goto out; 1209 - } 1210 1210 1211 1211 ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry, 1212 1212 leaf_bh, leaf_bh->b_data, leaf_bh->b_size); ··· 1239 1191 goto out; 1240 1192 } 1241 1193 1242 - ocfs2_dx_leaf_remove_entry(dx_leaf, index); 1194 + ocfs2_dx_list_remove_entry(entry_list, index); 1243 1195 1244 - ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh); 1196 + if (ocfs2_dx_root_inline(dx_root)) 1197 + ocfs2_journal_dirty(handle, dx_root_bh); 1198 + else 1199 + ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh); 1245 1200 1246 1201 out: 1247 1202 return ret; ··· 1341 1290 le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1); 1342 1291 } 1343 1292 1293 + static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list, 1294 + struct ocfs2_dx_hinfo *hinfo, 1295 + u64 dirent_blk) 1296 + { 1297 + int i; 1298 + struct ocfs2_dx_entry *dx_entry; 1299 + 1300 + i = le16_to_cpu(entry_list->de_num_used); 1301 + dx_entry = &entry_list->de_entries[i]; 1302 + 1303 + memset(dx_entry, 0, sizeof(*dx_entry)); 1304 + dx_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash); 1305 + dx_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash); 1306 + dx_entry->dx_dirent_blk = cpu_to_le64(dirent_blk); 1307 + 1308 + le16_add_cpu(&entry_list->de_num_used, 1); 1309 + } 1310 + 1344 1311 static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle, 1345 1312 struct ocfs2_dx_hinfo *hinfo, 1346 1313 u64 dirent_blk, 1347 1314 struct buffer_head *dx_leaf_bh) 1348 1315 { 1349 - int ret, i; 1350 - struct ocfs2_dx_entry *dx_entry; 1316 + int ret; 1351 1317 struct ocfs2_dx_leaf *dx_leaf; 1352 1318 1353 1319 ret = 
ocfs2_journal_access_dl(handle, dir, dx_leaf_bh, ··· 1375 1307 } 1376 1308 1377 1309 dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data; 1378 - i = le16_to_cpu(dx_leaf->dl_list.de_num_used); 1379 - dx_entry = &dx_leaf->dl_list.de_entries[i]; 1380 - 1381 - memset(dx_entry, 0, sizeof(*dx_entry)); 1382 - dx_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash); 1383 - dx_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash); 1384 - dx_entry->dx_dirent_blk = cpu_to_le64(dirent_blk); 1385 - 1386 - le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1); 1387 - 1310 + ocfs2_dx_entry_list_insert(&dx_leaf->dl_list, hinfo, dirent_blk); 1388 1311 ocfs2_journal_dirty(handle, dx_leaf_bh); 1389 1312 1390 1313 out: 1391 1314 return ret; 1392 1315 } 1393 1316 1394 - static int ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle, 1395 - struct ocfs2_dir_lookup_result *lookup) 1317 + static int ocfs2_dx_inline_root_insert(struct inode *dir, handle_t *handle, 1318 + struct ocfs2_dx_hinfo *hinfo, 1319 + u64 dirent_blk, 1320 + struct buffer_head *dx_root_bh) 1396 1321 { 1322 + int ret; 1323 + struct ocfs2_dx_root_block *dx_root; 1324 + 1325 + ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, 1326 + OCFS2_JOURNAL_ACCESS_WRITE); 1327 + if (ret) { 1328 + mlog_errno(ret); 1329 + goto out; 1330 + } 1331 + 1332 + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 1333 + ocfs2_dx_entry_list_insert(&dx_root->dr_entries, hinfo, dirent_blk); 1334 + ocfs2_journal_dirty(handle, dx_root_bh); 1335 + 1336 + out: 1337 + return ret; 1338 + } 1339 + 1340 + static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle, 1341 + struct ocfs2_dir_lookup_result *lookup) 1342 + { 1343 + struct ocfs2_dx_root_block *dx_root; 1344 + 1345 + dx_root = (struct ocfs2_dx_root_block *)lookup->dl_dx_root_bh->b_data; 1346 + if (ocfs2_dx_root_inline(dx_root)) 1347 + return ocfs2_dx_inline_root_insert(dir, handle, 1348 + &lookup->dl_hinfo, 1349 + lookup->dl_leaf_bh->b_blocknr, 1350 + 
lookup->dl_dx_root_bh); 1351 + 1397 1352 return __ocfs2_dx_dir_leaf_insert(dir, handle, &lookup->dl_hinfo, 1398 1353 lookup->dl_leaf_bh->b_blocknr, 1399 1354 lookup->dl_dx_leaf_bh); ··· 1500 1409 else { 1501 1410 status = ocfs2_journal_access_db(handle, dir, 1502 1411 insert_bh, 1503 - OCFS2_JOURNAL_ACCESS_WRITE); 1412 + OCFS2_JOURNAL_ACCESS_WRITE); 1413 + 1504 1414 if (ocfs2_dir_indexed(dir)) { 1505 - status = ocfs2_dx_dir_leaf_insert(dir, 1506 - handle, 1507 - lookup); 1415 + status = ocfs2_dx_dir_insert(dir, 1416 + handle, 1417 + lookup); 1508 1418 if (status) { 1509 1419 mlog_errno(status); 1510 1420 goto bail; ··· 2111 2019 handle_t *handle, struct inode *dir, 2112 2020 struct buffer_head *di_bh, 2113 2021 struct ocfs2_alloc_context *meta_ac, 2022 + int dx_inline, 2114 2023 struct buffer_head **ret_dx_root_bh) 2115 2024 { 2116 2025 int ret; ··· 2155 2062 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); 2156 2063 dx_root->dr_blkno = cpu_to_le64(dr_blkno); 2157 2064 dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno); 2158 - dx_root->dr_list.l_count = 2159 - cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); 2065 + 2066 + if (dx_inline) { 2067 + dx_root->dr_flags |= OCFS2_DX_FLAG_INLINE; 2068 + dx_root->dr_entries.de_count = 2069 + cpu_to_le16(ocfs2_dx_entries_per_root(osb->sb)); 2070 + } else { 2071 + dx_root->dr_list.l_count = 2072 + cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); 2073 + } 2160 2074 2161 2075 ret = ocfs2_journal_dirty(handle, dx_root_bh); 2162 2076 if (ret) ··· 2336 2236 struct ocfs2_alloc_context *data_ac, 2337 2237 struct ocfs2_alloc_context *meta_ac) 2338 2238 { 2339 - int ret, num_dx_leaves, i; 2239 + int ret; 2340 2240 struct buffer_head *leaf_bh = NULL; 2341 2241 struct buffer_head *dx_root_bh = NULL; 2342 - struct buffer_head **dx_leaves = NULL; 2343 - struct ocfs2_extent_tree et; 2344 2242 struct ocfs2_dx_hinfo hinfo; 2345 - u64 insert_blkno; 2346 - 2347 - dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, 
&num_dx_leaves); 2348 - if (!dx_leaves) { 2349 - ret = -ENOMEM; 2350 - mlog_errno(ret); 2351 - goto out; 2352 - } 2243 + struct ocfs2_dx_root_block *dx_root; 2244 + struct ocfs2_dx_entry_list *entry_list; 2353 2245 2354 2246 /* 2355 2247 * Our strategy is to create the directory as though it were ··· 2350 2258 * very well known quantity. 2351 2259 * 2352 2260 * Essentially, we have two dirents ("." and ".."), in the 1st 2353 - * block which need indexing. 2261 + * block which need indexing. These are easily inserted into 2262 + * the index block. 2354 2263 */ 2355 2264 2356 2265 ret = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh, ··· 2361 2268 goto out; 2362 2269 } 2363 2270 2364 - /* 2365 - * Allocate and format the index leaf first, before attaching 2366 - * the index root. That way we're sure that the main bitmap 2367 - * won't -enospc on us with a half-created dir index. 2368 - * 2369 - * The meta data allocation for our index block will not 2370 - * -enospc on us unless there is a disk corruption. 
2371 - */ 2372 - 2373 - ret = __ocfs2_dx_dir_new_cluster(inode, 0, handle, data_ac, dx_leaves, 2374 - num_dx_leaves, &insert_blkno); 2271 + ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, 2272 + meta_ac, 1, &dx_root_bh); 2375 2273 if (ret) { 2376 2274 mlog_errno(ret); 2377 2275 goto out; 2378 2276 } 2277 + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 2278 + entry_list = &dx_root->dr_entries; 2379 2279 2380 - ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo); 2381 - i = ocfs2_dx_dir_hash_idx(osb, &hinfo); 2382 - ret = __ocfs2_dx_dir_leaf_insert(inode, handle, &hinfo, 2383 - leaf_bh->b_blocknr, dx_leaves[i]); 2384 - if (ret) { 2385 - mlog_errno(ret); 2386 - goto out; 2387 - } 2280 + /* Buffer has been journaled for us by ocfs2_dx_dir_attach_index */ 2281 + ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr); 2388 2282 2389 2283 ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo); 2390 - i = ocfs2_dx_dir_hash_idx(osb, &hinfo); 2391 - ret = __ocfs2_dx_dir_leaf_insert(inode, handle, &hinfo, 2392 - leaf_bh->b_blocknr, dx_leaves[i]); 2393 - if (ret) { 2394 - mlog_errno(ret); 2395 - goto out; 2396 - } 2397 - 2398 - ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, meta_ac, 2399 - &dx_root_bh); 2400 - if (ret) { 2401 - mlog_errno(ret); 2402 - goto out; 2403 - } 2404 - 2405 - /* This should never fail considering we start with an empty 2406 - * dx_root. 
*/ 2407 - ocfs2_init_dx_root_extent_tree(&et, inode, dx_root_bh); 2408 - ret = ocfs2_insert_extent(osb, handle, inode, &et, 0, 2409 - insert_blkno, 1, 0, NULL); 2410 - if (ret) 2411 - mlog_errno(ret); 2284 + ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr); 2412 2285 2413 2286 out: 2414 - if (dx_leaves) { 2415 - for (i = 0; i < num_dx_leaves; i++) 2416 - brelse(dx_leaves[i]); 2417 - kfree(dx_leaves); 2418 - } 2419 2287 brelse(dx_root_bh); 2420 2288 brelse(leaf_bh); 2421 2289 return ret; ··· 2446 2392 out: 2447 2393 return ret; 2448 2394 } 2395 + /* 2396 + * XXX: This expects dx_root_bh to already be part of the transaction. 2397 + */ 2398 + static void ocfs2_dx_dir_index_root_block(struct inode *dir, 2399 + struct buffer_head *dx_root_bh, 2400 + struct buffer_head *dirent_bh) 2401 + { 2402 + char *de_buf, *limit; 2403 + struct ocfs2_dx_root_block *dx_root; 2404 + struct ocfs2_dir_entry *de; 2405 + struct ocfs2_dx_hinfo hinfo; 2406 + u64 dirent_blk = dirent_bh->b_blocknr; 2407 + 2408 + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 2409 + 2410 + de_buf = dirent_bh->b_data; 2411 + limit = de_buf + dir->i_sb->s_blocksize; 2412 + 2413 + while (de_buf < limit) { 2414 + de = (struct ocfs2_dir_entry *)de_buf; 2415 + 2416 + if (!de->name_len || !de->inode) 2417 + goto inc; 2418 + 2419 + ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo); 2420 + 2421 + mlog(0, 2422 + "dir: %llu, major: 0x%x minor: 0x%x, index: %u, name: %.*s\n", 2423 + (unsigned long long)dir->i_ino, hinfo.major_hash, 2424 + hinfo.minor_hash, 2425 + le16_to_cpu(dx_root->dr_entries.de_num_used), 2426 + de->name_len, de->name); 2427 + 2428 + ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo, 2429 + dirent_blk); 2430 + inc: 2431 + de_buf += le16_to_cpu(de->rec_len); 2432 + } 2433 + } 2434 + 2435 + /* 2436 + * Count the number of inline directory entries in di_bh and compare 2437 + * them against the number of entries we can hold in an inline dx root 2438 + * block. 
2439 + */ 2440 + static int ocfs2_new_dx_should_be_inline(struct inode *dir, 2441 + struct buffer_head *di_bh) 2442 + { 2443 + int dirent_count = 0; 2444 + char *de_buf, *limit; 2445 + struct ocfs2_dir_entry *de; 2446 + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 2447 + 2448 + de_buf = di->id2.i_data.id_data; 2449 + limit = de_buf + i_size_read(dir); 2450 + 2451 + while (de_buf < limit) { 2452 + de = (struct ocfs2_dir_entry *)de_buf; 2453 + 2454 + if (de->name_len && de->inode) 2455 + dirent_count++; 2456 + 2457 + de_buf += le16_to_cpu(de->rec_len); 2458 + } 2459 + 2460 + /* We are careful to leave room for one extra record. */ 2461 + return dirent_count < ocfs2_dx_entries_per_root(dir->i_sb); 2462 + } 2449 2463 2450 2464 /* 2451 2465 * Expand rec_len of the rightmost dirent in a directory block so that it ··· 2564 2442 { 2565 2443 u32 alloc, dx_alloc, bit_off, len; 2566 2444 struct super_block *sb = dir->i_sb; 2567 - int ret, i, num_dx_leaves = 0, 2445 + int ret, i, num_dx_leaves = 0, dx_inline = 0, 2568 2446 credits = ocfs2_inline_to_extents_credits(sb); 2569 2447 u64 dx_insert_blkno, blkno, 2570 2448 bytes = blocks_wanted << sb->s_blocksize_bits; ··· 2587 2465 dx_alloc = 0; 2588 2466 2589 2467 if (ocfs2_supports_indexed_dirs(osb)) { 2590 - /* Add one more cluster for an index leaf */ 2591 - dx_alloc++; 2592 2468 credits += ocfs2_add_dir_index_credits(sb); 2593 2469 2594 - dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb, &num_dx_leaves); 2595 - if (!dx_leaves) { 2596 - ret = -ENOMEM; 2597 - mlog_errno(ret); 2598 - goto out; 2470 + dx_inline = ocfs2_new_dx_should_be_inline(dir, di_bh); 2471 + if (!dx_inline) { 2472 + /* Add one more cluster for an index leaf */ 2473 + dx_alloc++; 2474 + dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb, 2475 + &num_dx_leaves); 2476 + if (!dx_leaves) { 2477 + ret = -ENOMEM; 2478 + mlog_errno(ret); 2479 + goto out; 2480 + } 2599 2481 } 2600 2482 2601 2483 /* This gets us the dx_root */ ··· 2650 2524 } 2651 2525 did_quota = 
1; 2652 2526 2653 - if (ocfs2_supports_indexed_dirs(osb)) { 2527 + if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { 2654 2528 /* 2655 2529 * Allocate our index cluster first, to maximize the 2656 2530 * possibility that unindexed leaves grow ··· 2713 2587 goto out_commit; 2714 2588 } 2715 2589 2716 - if (ocfs2_supports_indexed_dirs(osb)) { 2590 + if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { 2591 + /* 2592 + * Dx dirs with an external cluster need to do this up 2593 + * front. Inline dx root's get handled later, after 2594 + * we've allocated our root block. 2595 + */ 2717 2596 ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves, 2718 2597 num_dx_leaves, dirdata_bh); 2719 2598 if (ret) { ··· 2781 2650 2782 2651 if (ocfs2_supports_indexed_dirs(osb)) { 2783 2652 ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh, 2784 - meta_ac, &dx_root_bh); 2653 + meta_ac, dx_inline, 2654 + &dx_root_bh); 2785 2655 if (ret) { 2786 2656 mlog_errno(ret); 2787 2657 goto out_commit; 2788 2658 } 2789 2659 2790 - ocfs2_init_dx_root_extent_tree(&dx_et, dir, dx_root_bh); 2791 - ret = ocfs2_insert_extent(osb, handle, dir, &dx_et, 0, 2792 - dx_insert_blkno, 1, 0, NULL); 2793 - if (ret) 2794 - mlog_errno(ret); 2660 + if (dx_inline) { 2661 + ocfs2_dx_dir_index_root_block(dir, dx_root_bh, 2662 + dirdata_bh); 2663 + } else { 2664 + ocfs2_init_dx_root_extent_tree(&dx_et, dir, dx_root_bh); 2665 + ret = ocfs2_insert_extent(osb, handle, dir, &dx_et, 0, 2666 + dx_insert_blkno, 1, 0, NULL); 2667 + if (ret) 2668 + mlog_errno(ret); 2669 + } 2795 2670 } 2796 2671 2797 2672 /* ··· 2827 2690 if (ocfs2_supports_indexed_dirs(osb)) { 2828 2691 unsigned int off; 2829 2692 2830 - /* 2831 - * We need to return the correct block within the 2832 - * cluster which should hold our entry. 
2833 - */ 2834 - off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), 2835 - &lookup->dl_hinfo); 2836 - get_bh(dx_leaves[off]); 2837 - lookup->dl_dx_leaf_bh = dx_leaves[off]; 2693 + if (!dx_inline) { 2694 + /* 2695 + * We need to return the correct block within the 2696 + * cluster which should hold our entry. 2697 + */ 2698 + off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), 2699 + &lookup->dl_hinfo); 2700 + get_bh(dx_leaves[off]); 2701 + lookup->dl_dx_leaf_bh = dx_leaves[off]; 2702 + } 2703 + lookup->dl_dx_root_bh = dx_root_bh; 2704 + dx_root_bh = NULL; 2838 2705 } 2839 2706 2840 2707 out_commit: ··· 3647 3506 return ret; 3648 3507 } 3649 3508 3509 + static int ocfs2_expand_inline_dx_root(struct inode *dir, 3510 + struct buffer_head *dx_root_bh) 3511 + { 3512 + int ret, num_dx_leaves, i, j, did_quota = 0; 3513 + struct buffer_head **dx_leaves = NULL; 3514 + struct ocfs2_extent_tree et; 3515 + u64 insert_blkno; 3516 + struct ocfs2_alloc_context *data_ac = NULL; 3517 + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 3518 + handle_t *handle = NULL; 3519 + struct ocfs2_dx_root_block *dx_root; 3520 + struct ocfs2_dx_entry_list *entry_list; 3521 + struct ocfs2_dx_entry *dx_entry; 3522 + struct ocfs2_dx_leaf *target_leaf; 3523 + 3524 + ret = ocfs2_reserve_clusters(osb, 1, &data_ac); 3525 + if (ret) { 3526 + mlog_errno(ret); 3527 + goto out; 3528 + } 3529 + 3530 + dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves); 3531 + if (!dx_leaves) { 3532 + ret = -ENOMEM; 3533 + mlog_errno(ret); 3534 + goto out; 3535 + } 3536 + 3537 + handle = ocfs2_start_trans(osb, ocfs2_calc_dxi_expand_credits(osb->sb)); 3538 + if (IS_ERR(handle)) { 3539 + ret = PTR_ERR(handle); 3540 + mlog_errno(ret); 3541 + goto out; 3542 + } 3543 + 3544 + if (vfs_dq_alloc_space_nodirty(dir, 3545 + ocfs2_clusters_to_bytes(osb->sb, 1))) { 3546 + ret = -EDQUOT; 3547 + goto out_commit; 3548 + } 3549 + did_quota = 1; 3550 + 3551 + /* 3552 + * We do this up front, before the allocation, so that a 3553 + * 
failure to add the dx_root_bh to the journal won't result 3554 + * us losing clusters. 3555 + */ 3556 + ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, 3557 + OCFS2_JOURNAL_ACCESS_WRITE); 3558 + if (ret) { 3559 + mlog_errno(ret); 3560 + goto out_commit; 3561 + } 3562 + 3563 + ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, dx_leaves, 3564 + num_dx_leaves, &insert_blkno); 3565 + if (ret) { 3566 + mlog_errno(ret); 3567 + goto out_commit; 3568 + } 3569 + 3570 + /* 3571 + * Transfer the entries from our dx_root into the appropriate 3572 + * block 3573 + */ 3574 + dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; 3575 + entry_list = &dx_root->dr_entries; 3576 + 3577 + for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) { 3578 + dx_entry = &entry_list->de_entries[i]; 3579 + 3580 + j = __ocfs2_dx_dir_hash_idx(osb, 3581 + le32_to_cpu(dx_entry->dx_minor_hash)); 3582 + target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j]->b_data; 3583 + 3584 + ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry); 3585 + 3586 + /* Each leaf has been passed to the journal already 3587 + * via __ocfs2_dx_dir_new_cluster() */ 3588 + } 3589 + 3590 + dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE; 3591 + memset(&dx_root->dr_list, 0, osb->sb->s_blocksize - 3592 + offsetof(struct ocfs2_dx_root_block, dr_list)); 3593 + dx_root->dr_list.l_count = 3594 + cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); 3595 + 3596 + /* This should never fail considering we start with an empty 3597 + * dx_root. 
*/ 3598 + ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh); 3599 + ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, 3600 + insert_blkno, 1, 0, NULL); 3601 + if (ret) 3602 + mlog_errno(ret); 3603 + did_quota = 0; 3604 + 3605 + ocfs2_journal_dirty(handle, dx_root_bh); 3606 + 3607 + out_commit: 3608 + if (ret < 0 && did_quota) 3609 + vfs_dq_free_space_nodirty(dir, 3610 + ocfs2_clusters_to_bytes(dir->i_sb, 1)); 3611 + 3612 + ocfs2_commit_trans(osb, handle); 3613 + 3614 + out: 3615 + if (data_ac) 3616 + ocfs2_free_alloc_context(data_ac); 3617 + 3618 + if (dx_leaves) { 3619 + for (i = 0; i < num_dx_leaves; i++) 3620 + brelse(dx_leaves[i]); 3621 + kfree(dx_leaves); 3622 + } 3623 + return ret; 3624 + } 3625 + 3626 + static int ocfs2_inline_dx_has_space(struct buffer_head *dx_root_bh) 3627 + { 3628 + struct ocfs2_dx_root_block *dx_root; 3629 + struct ocfs2_dx_entry_list *entry_list; 3630 + 3631 + dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; 3632 + entry_list = &dx_root->dr_entries; 3633 + 3634 + if (le16_to_cpu(entry_list->de_num_used) >= 3635 + le16_to_cpu(entry_list->de_count)) 3636 + return -ENOSPC; 3637 + 3638 + return 0; 3639 + } 3640 + 3650 3641 static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir, 3651 3642 struct buffer_head *di_bh, const char *name, 3652 3643 int namelen, ··· 3800 3527 } 3801 3528 3802 3529 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 3530 + if (ocfs2_dx_root_inline(dx_root)) { 3531 + ret = ocfs2_inline_dx_has_space(dx_root_bh); 3532 + 3533 + if (ret == 0) 3534 + goto search_el; 3535 + 3536 + /* 3537 + * We ran out of room in the root block. Expand it to 3538 + * an extent, then allow ocfs2_find_dir_space_dx to do 3539 + * the rest. 
3540 + */ 3541 + ret = ocfs2_expand_inline_dx_root(dir, dx_root_bh); 3542 + if (ret) { 3543 + mlog_errno(ret); 3544 + goto out; 3545 + } 3546 + } 3803 3547 3804 3548 restart_search: 3805 3549 ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo, ··· 3868 3578 goto restart_search; 3869 3579 } 3870 3580 3581 + search_el: 3871 3582 lookup->dl_dx_leaf_bh = dx_leaf_bh; 3872 3583 dx_leaf_bh = NULL; 3584 + lookup->dl_dx_root_bh = dx_root_bh; 3585 + dx_root_bh = NULL; 3873 3586 3874 3587 out: 3875 3588 brelse(dx_leaf_bh); ··· 4067 3774 mlog_errno(ret); 4068 3775 goto out; 4069 3776 } 3777 + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 3778 + 3779 + if (ocfs2_dx_root_inline(dx_root)) 3780 + goto remove_index; 4070 3781 4071 3782 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh); 4072 - 4073 - dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 4074 3783 4075 3784 /* XXX: What if dr_clusters is too large? */ 4076 3785 while (le32_to_cpu(dx_root->dr_clusters)) { ··· 4098 3803 major_hash = cpos - 1; 4099 3804 } 4100 3805 3806 + remove_index: 4101 3807 ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh); 4102 3808 if (ret) { 4103 3809 mlog_errno(ret);
+2
fs/ocfs2/dir.h
··· 37 37 struct ocfs2_dir_entry *dl_entry; /* Target dirent in 38 38 * unindexed leaf */ 39 39 40 + struct buffer_head *dl_dx_root_bh; /* Root of indexed 41 + * tree */ 40 42 struct buffer_head *dl_dx_leaf_bh; /* Indexed leaf block */ 41 43 struct ocfs2_dx_entry *dl_dx_entry; /* Target dx_entry in 42 44 * indexed leaf */
+10
fs/ocfs2/journal.h
··· 458 458 #define OCFS2_DX_ROOT_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \ 459 459 OCFS2_SUBALLOC_FREE) 460 460 461 + static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb) 462 + { 463 + int credits = 1 + OCFS2_SUBALLOC_ALLOC; 464 + 465 + credits += ocfs2_clusters_to_blocks(sb, 1); 466 + credits += ocfs2_quota_trans_credits(sb); 467 + 468 + return credits; 469 + } 470 + 461 471 /* 462 472 * Please note that the caller must make sure that root_el is the root 463 473 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+1 -3
fs/ocfs2/namei.c
··· 321 321 want_clusters += 1; 322 322 323 323 /* Dir indexing requires extra space as well */ 324 - if (ocfs2_supports_indexed_dirs(osb)) { 325 - want_clusters++; 324 + if (ocfs2_supports_indexed_dirs(osb)) 326 325 want_meta++; 327 - } 328 326 } 329 327 330 328 status = ocfs2_reserve_new_metadata_blocks(osb, want_meta, &meta_ac);
+24 -4
fs/ocfs2/ocfs2_fs.h
··· 815 815 * length de_num_used */ 816 816 }; 817 817 818 + #define OCFS2_DX_FLAG_INLINE 0x01 819 + 818 820 /* 819 821 * A directory indexing block. Each indexed directory has one of these, 820 822 * pointed to by ocfs2_dinode. ··· 837 835 * extent block */ 838 836 __le32 dr_clusters; /* Clusters allocated 839 837 * to the indexed tree. */ 840 - __le32 dr_reserved1; 838 + __u8 dr_flags; /* OCFS2_DX_FLAG_* flags */ 839 + __u8 dr_reserved0; 840 + __le16 dr_reserved1; 841 841 __le64 dr_dir_blkno; /* Pointer to parent inode */ 842 842 __le64 dr_reserved2; 843 843 __le64 dr_reserved3[16]; 844 - struct ocfs2_extent_list dr_list; /* Keep this aligned to 128 845 - * bits for maximum space 846 - * efficiency. */ 844 + union { 845 + struct ocfs2_extent_list dr_list; /* Keep this aligned to 128 846 + * bits for maximum space 847 + * efficiency. */ 848 + struct ocfs2_dx_entry_list dr_entries; /* In-root-block list of 849 + * entries. We grow out 850 + * to extents if this 851 + * gets too big. */ 852 + }; 847 853 }; 848 854 849 855 /* ··· 1234 1224 1235 1225 size = sb->s_blocksize - 1236 1226 offsetof(struct ocfs2_dx_leaf, dl_list.de_entries); 1227 + 1228 + return size / sizeof(struct ocfs2_dx_entry); 1229 + } 1230 + 1231 + static inline int ocfs2_dx_entries_per_root(struct super_block *sb) 1232 + { 1233 + int size; 1234 + 1235 + size = sb->s_blocksize - 1236 + offsetof(struct ocfs2_dx_root_block, dr_entries.de_entries); 1237 1237 1238 1238 return size / sizeof(struct ocfs2_dx_entry); 1239 1239 }