Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ocfs2: Introduce dir free space list

The only operation which doesn't get faster with directory indexing is
insert, which still has to walk the entire unindexed directory portion to
find a free block. This patch provides an improvement in directory insert
performance by maintaining a singly linked list of directory leaf blocks
which have space for additional dirents.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Acked-by: Joel Becker <joel.becker@oracle.com>

+490 -93
+472 -86
fs/ocfs2/dir.c
··· 80 80 struct ocfs2_alloc_context *data_ac, 81 81 struct ocfs2_alloc_context *meta_ac, 82 82 struct buffer_head **new_bh); 83 + static int ocfs2_dir_indexed(struct inode *inode); 83 84 84 85 /* 85 86 * These are distinct checks because future versions of the file system will 86 87 * want to have a trailing dirent structure independent of indexing. 87 88 */ 88 - static int ocfs2_dir_has_trailer(struct inode *dir) 89 + static int ocfs2_supports_dir_trailer(struct inode *dir) 89 90 { 91 + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 92 + 90 93 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 91 94 return 0; 92 95 93 - return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb)); 96 + return ocfs2_meta_ecc(osb) || ocfs2_dir_indexed(dir); 94 97 } 95 98 96 - static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb) 99 + /* 100 + * "new' here refers to the point at which we're creating a new 101 + * directory via "mkdir()", but also when we're expanding an inline 102 + * directory. In either case, we don't yet have the indexing bit set 103 + * on the directory, so the standard checks will fail in when metaecc 104 + * is turned off. Only directory-initialization type functions should 105 + * use this then. Everything else wants ocfs2_supports_dir_trailer() 106 + */ 107 + static int ocfs2_new_dir_wants_trailer(struct inode *dir) 97 108 { 98 - return ocfs2_meta_ecc(osb); 109 + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 110 + 111 + return ocfs2_meta_ecc(osb) || 112 + ocfs2_supports_indexed_dirs(osb); 99 113 } 100 114 101 115 static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb) ··· 141 127 { 142 128 unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer); 143 129 144 - if (!ocfs2_dir_has_trailer(dir)) 130 + if (!ocfs2_supports_dir_trailer(dir)) 145 131 return 0; 146 132 147 133 if (offset != toff) ··· 151 137 } 152 138 153 139 static void ocfs2_init_dir_trailer(struct inode *inode, 154 - struct buffer_head *bh) 140 + struct buffer_head *bh, u16 rec_len) 155 141 { 156 142 struct ocfs2_dir_block_trailer *trailer; 157 143 ··· 161 147 cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer)); 162 148 trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); 163 149 trailer->db_blkno = cpu_to_le64(bh->b_blocknr); 150 + trailer->db_free_rec_len = cpu_to_le16(rec_len); 151 + } 152 + /* 153 + * Link an unindexed block with a dir trailer structure into the index free 154 + * list. This function will modify dirdata_bh, but assumes you've already 155 + * passed it to the journal. 156 + */ 157 + static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle, 158 + struct buffer_head *dx_root_bh, 159 + struct buffer_head *dirdata_bh) 160 + { 161 + int ret; 162 + struct ocfs2_dx_root_block *dx_root; 163 + struct ocfs2_dir_block_trailer *trailer; 164 + 165 + ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, 166 + OCFS2_JOURNAL_ACCESS_WRITE); 167 + if (ret) { 168 + mlog_errno(ret); 169 + goto out; 170 + } 171 + trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb); 172 + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 173 + 174 + trailer->db_free_next = dx_root->dr_free_blk; 175 + dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr); 176 + 177 + ocfs2_journal_dirty(handle, dx_root_bh); 178 + 179 + out: 180 + return ret; 181 + } 182 + 183 + static int ocfs2_free_list_at_root(struct ocfs2_dir_lookup_result *res) 184 + { 185 + return res->dl_prev_leaf_bh == NULL; 164 186 } 165 187 166 188 void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res) ··· 204 154 brelse(res->dl_dx_root_bh); 205 155 brelse(res->dl_leaf_bh); 206 156 brelse(res->dl_dx_leaf_bh); 157 + brelse(res->dl_prev_leaf_bh); 207 158 } 208 159 209 160 static int ocfs2_dir_indexed(struct inode *inode) ··· 535 484 } 536 485 537 486 if (!(flags & OCFS2_BH_READAHEAD) && 538 - ocfs2_dir_has_trailer(inode)) { 487 + ocfs2_supports_dir_trailer(inode)) { 539 488 rc = ocfs2_check_dir_trailer(inode, tmp); 540 489 if (rc) { 541 490 if (!*bh) ··· 1201 1150 return status; 1202 1151 } 1203 1152 1153 + static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de) 1154 + { 1155 + unsigned int hole; 1156 + 1157 + if (le64_to_cpu(de->inode) == 0) 1158 + hole = le16_to_cpu(de->rec_len); 1159 + else 1160 + hole = le16_to_cpu(de->rec_len) - 1161 + OCFS2_DIR_REC_LEN(de->name_len); 1162 + 1163 + return hole; 1164 + } 1165 + 1166 + static int ocfs2_find_max_rec_len(struct super_block *sb, 1167 + struct buffer_head *dirblock_bh) 1168 + { 1169 + int size, this_hole, largest_hole = 0; 1170 + char *trailer, *de_buf, *limit, *start = dirblock_bh->b_data; 1171 + struct ocfs2_dir_entry *de; 1172 + 1173 + trailer = (char *)ocfs2_trailer_from_bh(dirblock_bh, sb); 1174 + size = ocfs2_dir_trailer_blk_off(sb); 1175 + limit = start + size; 1176 + de_buf = start; 1177 + de = (struct ocfs2_dir_entry *)de_buf; 1178 + do { 1179 + if (de_buf != trailer) { 1180 + this_hole = ocfs2_figure_dirent_hole(de); 1181 + if (this_hole > largest_hole) 1182 + largest_hole = this_hole; 1183 + } 1184 + 1185 + de_buf += le16_to_cpu(de->rec_len); 1186 + de = (struct ocfs2_dir_entry *)de_buf; 1187 + } while (de_buf < limit); 1188 + 1189 + if (largest_hole >= OCFS2_DIR_MIN_REC_LEN) 1190 + return largest_hole; 1191 + return 0; 1192 + } 1193 + 1204 1194 static void ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list *entry_list, 1205 1195 int index) 1206 1196 { ··· 1263 1171 static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir, 1264 1172 struct ocfs2_dir_lookup_result *lookup) 1265 1173 { 1266 - int ret, index; 1174 + int ret, index, max_rec_len, add_to_free_list = 0; 1267 1175 struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; 1268 1176 struct buffer_head *leaf_bh = lookup->dl_leaf_bh; 1269 1177 struct ocfs2_dx_leaf *dx_leaf; 1270 1178 struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry; 1179 + struct ocfs2_dir_block_trailer *trailer; 1271 1180 struct ocfs2_dx_root_block *dx_root; 1272 1181 struct ocfs2_dx_entry_list *entry_list; 1273 1182 1183 + /* 1184 + * This function gets a bit messy because we might have to 1185 + * modify the root block, regardless of whether the indexed 1186 + * entries are stored inline. 1187 + */ 1188 + 1189 + /* 1190 + * *Only* set 'entry_list' here, based on where we're looking 1191 + * for the indexed entries. Later, we might still want to 1192 + * journal both blocks, based on free list state. 1193 + */ 1274 1194 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 1275 1195 if (ocfs2_dx_root_inline(dx_root)) { 1276 1196 entry_list = &dx_root->dr_entries; ··· 1307 1203 } 1308 1204 1309 1205 /* 1206 + * We know that removal of this dirent will leave enough room 1207 + * for a new one, so add this block to the free list if it 1208 + * isn't already there. 1209 + */ 1210 + trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb); 1211 + if (trailer->db_free_rec_len == 0) 1212 + add_to_free_list = 1; 1213 + 1214 + /* 1310 1215 * Add the block holding our index into the journal before 1311 1216 * removing the unindexed entry. If we get an error return 1312 1217 * from __ocfs2_delete_entry(), then it hasn't removed the ··· 1325 1212 * We're also careful to journal the root tree block here if 1326 1213 * we're going to be adding to the start of the free list. 1327 1214 */ 1328 - if (ocfs2_dx_root_inline(dx_root)) { 1215 + if (add_to_free_list || ocfs2_dx_root_inline(dx_root)) { 1329 1216 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, 1330 1217 OCFS2_JOURNAL_ACCESS_WRITE); 1331 1218 if (ret) { 1332 1219 mlog_errno(ret); 1333 1220 goto out; 1334 1221 } 1335 - } else { 1222 + } 1223 + 1224 + if (!ocfs2_dx_root_inline(dx_root)) { 1336 1225 ret = ocfs2_journal_access_dl(handle, dir, 1337 1226 lookup->dl_dx_leaf_bh, 1338 1227 OCFS2_JOURNAL_ACCESS_WRITE); ··· 1353 1238 mlog_errno(ret); 1354 1239 goto out; 1355 1240 } 1241 + 1242 + max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, leaf_bh); 1243 + trailer->db_free_rec_len = cpu_to_le16(max_rec_len); 1244 + if (add_to_free_list) { 1245 + trailer->db_free_next = dx_root->dr_free_blk; 1246 + dx_root->dr_free_blk = cpu_to_le64(leaf_bh->b_blocknr); 1247 + ocfs2_journal_dirty(handle, dx_root_bh); 1248 + } 1249 + 1250 + /* leaf_bh was journal_accessed for us in __ocfs2_delete_entry */ 1251 + ocfs2_journal_dirty(handle, leaf_bh); 1356 1252 1357 1253 ocfs2_dx_list_remove_entry(entry_list, index); 1358 1254 ··· 1548 1422 lookup->dl_dx_leaf_bh); 1549 1423 } 1550 1424 1425 + static void ocfs2_remove_block_from_free_list(struct inode *dir, 1426 + handle_t *handle, 1427 + struct ocfs2_dir_lookup_result *lookup) 1428 + { 1429 + struct ocfs2_dir_block_trailer *trailer, *prev; 1430 + struct ocfs2_dx_root_block *dx_root; 1431 + struct buffer_head *bh; 1432 + 1433 + trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb); 1434 + 1435 + if (ocfs2_free_list_at_root(lookup)) { 1436 + bh = lookup->dl_dx_root_bh; 1437 + dx_root = (struct ocfs2_dx_root_block *)bh->b_data; 1438 + dx_root->dr_free_blk = trailer->db_free_next; 1439 + } else { 1440 + bh = lookup->dl_prev_leaf_bh; 1441 + prev = ocfs2_trailer_from_bh(bh, dir->i_sb); 1442 + prev->db_free_next = trailer->db_free_next; 1443 + } 1444 + 1445 + trailer->db_free_rec_len = cpu_to_le16(0); 1446 + trailer->db_free_next = cpu_to_le64(0); 1447 + 1448 + ocfs2_journal_dirty(handle, bh); 1449 + ocfs2_journal_dirty(handle, lookup->dl_leaf_bh); 1450 + } 1451 + 1452 + /* 1453 + * This expects that a journal write has been reserved on 1454 + * lookup->dl_prev_leaf_bh or lookup->dl_dx_root_bh 1455 + */ 1456 + static void ocfs2_recalc_free_list(struct inode *dir, handle_t *handle, 1457 + struct ocfs2_dir_lookup_result *lookup) 1458 + { 1459 + int max_rec_len; 1460 + struct ocfs2_dir_block_trailer *trailer; 1461 + 1462 + /* Walk dl_leaf_bh to figure out what the new free rec_len is. */ 1463 + max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, lookup->dl_leaf_bh); 1464 + if (max_rec_len) { 1465 + /* 1466 + * There's still room in this block, so no need to remove it 1467 + * from the free list. In this case, we just want to update 1468 + * the rec len accounting. 1469 + */ 1470 + trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb); 1471 + trailer->db_free_rec_len = cpu_to_le16(max_rec_len); 1472 + ocfs2_journal_dirty(handle, lookup->dl_leaf_bh); 1473 + } else { 1474 + ocfs2_remove_block_from_free_list(dir, handle, lookup); 1475 + } 1476 + } 1477 + 1551 1478 /* we don't always have a dentry for what we want to add, so people 1552 1479 * like orphan dir can call this instead. 1553 1480 * ··· 1629 1450 if (!namelen) 1630 1451 return -EINVAL; 1631 1452 1632 - if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1453 + if (ocfs2_dir_indexed(dir)) { 1454 + struct buffer_head *bh; 1455 + 1456 + /* 1457 + * An indexed dir may require that we update the free space 1458 + * list. Reserve a write to the previous node in the list so 1459 + * that we don't fail later. 1460 + * 1461 + * XXX: This can be either a dx_root_block, or an unindexed 1462 + * directory tree leaf block. 1463 + */ 1464 + if (ocfs2_free_list_at_root(lookup)) { 1465 + bh = lookup->dl_dx_root_bh; 1466 + retval = ocfs2_journal_access_dr(handle, dir, bh, 1467 + OCFS2_JOURNAL_ACCESS_WRITE); 1468 + } else { 1469 + bh = lookup->dl_prev_leaf_bh; 1470 + retval = ocfs2_journal_access_db(handle, dir, bh, 1471 + OCFS2_JOURNAL_ACCESS_WRITE); 1472 + } 1473 + if (retval) { 1474 + mlog_errno(retval); 1475 + return retval; 1476 + } 1477 + } else if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1633 1478 data_start = di->id2.i_data.id_data; 1634 1479 size = i_size_read(dir); 1635 1480 ··· 1735 1532 de->inode = 0; 1736 1533 de->name_len = namelen; 1737 1534 memcpy(de->name, name, namelen); 1535 + 1536 + if (ocfs2_dir_indexed(dir)) 1537 + ocfs2_recalc_free_list(dir, handle, lookup); 1738 1538 1739 1539 dir->i_version++; 1740 1540 status = ocfs2_journal_dirty(handle, insert_bh); ··· 2262 2056 2263 2057 mlog_entry_void(); 2264 2058 2265 - if (ocfs2_supports_dir_trailer(osb)) 2059 + if (ocfs2_new_dir_wants_trailer(inode)) 2266 2060 size = ocfs2_dir_trailer_blk_off(parent->i_sb); 2267 2061 2268 2062 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, ··· 2283 2077 memset(new_bh->b_data, 0, osb->sb->s_blocksize); 2284 2078 2285 2079 de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size); 2286 - if (ocfs2_supports_dir_trailer(osb)) 2287 - ocfs2_init_dir_trailer(inode, new_bh); 2080 + if (ocfs2_new_dir_wants_trailer(inode)) { 2081 + int size = le16_to_cpu(de->rec_len); 2082 + 2083 + /* 2084 + * Figure out the size of the hole left over after 2085 + * insertion of '.' and '..'. The trailer wants this 2086 + * information. 2087 + */ 2088 + size -= OCFS2_DIR_REC_LEN(2); 2089 + size -= sizeof(struct ocfs2_dir_block_trailer); 2090 + 2091 + ocfs2_init_dir_trailer(inode, new_bh, size); 2092 + } 2288 2093 2289 2094 status = ocfs2_journal_dirty(handle, new_bh); 2290 2095 if (status < 0) { ··· 2327 2110 static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, 2328 2111 handle_t *handle, struct inode *dir, 2329 2112 struct buffer_head *di_bh, 2113 + struct buffer_head *dirdata_bh, 2330 2114 struct ocfs2_alloc_context *meta_ac, 2331 2115 int dx_inline, 2332 2116 struct buffer_head **ret_dx_root_bh) ··· 2339 2121 unsigned int num_bits; 2340 2122 struct buffer_head *dx_root_bh = NULL; 2341 2123 struct ocfs2_dx_root_block *dx_root; 2124 + struct ocfs2_dir_block_trailer *trailer = 2125 + ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb); 2342 2126 2343 2127 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit, 2344 2128 &num_bits, &dr_blkno); ··· 2375 2155 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); 2376 2156 dx_root->dr_blkno = cpu_to_le64(dr_blkno); 2377 2157 dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno); 2158 + if (le16_to_cpu(trailer->db_free_rec_len)) 2159 + dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr); 2160 + else 2161 + dx_root->dr_free_blk = cpu_to_le64(0); 2378 2162 2379 2163 if (dx_inline) { 2380 2164 dx_root->dr_flags |= OCFS2_DX_FLAG_INLINE; ··· 2585 2361 goto out; 2586 2362 } 2587 2363 2588 - ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, 2364 + ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, leaf_bh, 2589 2365 meta_ac, 1, &dx_root_bh); 2590 2366 if (ret) { 2591 2367 mlog_errno(ret); ··· 2595 2371 entry_list = &dx_root->dr_entries; 2596 2372 2597 2373 /* Buffer has been journaled for us by ocfs2_dx_dir_attach_index */ 2374 + ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo); 2598 2375 ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr); 2599 2376 2600 2377 ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo); ··· 2671 2446 out: 2672 2447 return ret; 2673 2448 } 2674 - /* 2449 + 2450 + /* 2675 2451 * XXX: This expects dx_root_bh to already be part of the transaction. 2676 2452 */ 2677 2453 static void ocfs2_dx_dir_index_root_block(struct inode *dir, ··· 2747 2521 * expansion from an inline directory to one with extents. The first dir block 2748 2522 * in that case is taken from the inline data portion of the inode block. 2749 2523 * 2524 + * This will also return the largest amount of contiguous space for a dirent 2525 + * in the block. That value is *not* necessarily the last dirent, even after 2526 + * expansion. The directory indexing code wants this value for free space 2527 + * accounting. We do this here since we're already walking the entire dir 2528 + * block. 2529 + * 2750 2530 * We add the dir trailer if this filesystem wants it. 2751 2531 */ 2752 - static void ocfs2_expand_last_dirent(char *start, unsigned int old_size, 2753 - struct super_block *sb) 2532 + static unsigned int ocfs2_expand_last_dirent(char *start, unsigned int old_size, 2533 + struct inode *dir) 2754 2534 { 2535 + struct super_block *sb = dir->i_sb; 2755 2536 struct ocfs2_dir_entry *de; 2756 2537 struct ocfs2_dir_entry *prev_de; 2757 2538 char *de_buf, *limit; 2758 2539 unsigned int new_size = sb->s_blocksize; 2759 - unsigned int bytes; 2540 + unsigned int bytes, this_hole; 2541 + unsigned int largest_hole = 0; 2760 2542 2761 - if (ocfs2_supports_dir_trailer(OCFS2_SB(sb))) 2543 + if (ocfs2_new_dir_wants_trailer(dir)) 2762 2544 new_size = ocfs2_dir_trailer_blk_off(sb); 2763 2545 2764 2546 bytes = new_size - old_size; ··· 2775 2541 de_buf = start; 2776 2542 de = (struct ocfs2_dir_entry *)de_buf; 2777 2543 do { 2544 + this_hole = ocfs2_figure_dirent_hole(de); 2545 + if (this_hole > largest_hole) 2546 + largest_hole = this_hole; 2547 + 2778 2548 prev_de = de; 2779 2549 de_buf += le16_to_cpu(de->rec_len); 2780 2550 de = (struct ocfs2_dir_entry *)de_buf; 2781 2551 } while (de_buf < limit); 2782 2552 2783 2553 le16_add_cpu(&prev_de->rec_len, bytes); 2554 + 2555 + /* We need to double check this after modification of the final 2556 + * dirent. */ 2557 + this_hole = ocfs2_figure_dirent_hole(prev_de); 2558 + if (this_hole > largest_hole) 2559 + largest_hole = this_hole; 2560 + 2561 + if (largest_hole >= OCFS2_DIR_MIN_REC_LEN) 2562 + return largest_hole; 2563 + return 0; 2784 2564 } 2785 2565 2786 2566 /* ··· 2951 2703 memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir)); 2952 2704 memset(dirdata_bh->b_data + i_size_read(dir), 0, 2953 2705 sb->s_blocksize - i_size_read(dir)); 2954 - ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb); 2955 - if (ocfs2_supports_dir_trailer(osb)) 2956 - ocfs2_init_dir_trailer(dir, dirdata_bh); 2706 + i = ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), dir); 2707 + if (ocfs2_new_dir_wants_trailer(dir)) { 2708 + /* 2709 + * Prepare the dir trailer up front. It will otherwise look 2710 + * like a valid dirent. Even if inserting the index fails 2711 + * (unlikely), then all we'll have done is given first dir 2712 + * block a small amount of fragmentation. 2713 + */ 2714 + ocfs2_init_dir_trailer(dir, dirdata_bh, i); 2715 + } 2957 2716 2958 2717 ret = ocfs2_journal_dirty(handle, dirdata_bh); 2959 2718 if (ret) { ··· 3036 2781 3037 2782 if (ocfs2_supports_indexed_dirs(osb)) { 3038 2783 ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh, 3039 - meta_ac, dx_inline, 2784 + dirdata_bh, meta_ac, dx_inline, 3040 2785 &dx_root_bh); 3041 2786 if (ret) { 3042 2787 mlog_errno(ret); ··· 3188 2933 * is to be turned into an extent based one. The size of the dirent to 3189 2934 * insert might be larger than the space gained by growing to just one 3190 2935 * block, so we may have to grow the inode by two blocks in that case. 2936 + * 2937 + * If the directory is already indexed, dx_root_bh must be provided. 3191 2938 */ 3192 2939 static int ocfs2_extend_dir(struct ocfs2_super *osb, 3193 2940 struct inode *dir, ··· 3210 2953 struct ocfs2_dir_entry * de; 3211 2954 struct super_block *sb = osb->sb; 3212 2955 struct ocfs2_extent_tree et; 2956 + struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; 3213 2957 3214 2958 mlog_entry_void(); 3215 2959 3216 2960 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 2961 + /* 2962 + * This would be a code error as an inline directory should 2963 + * never have an index root. 2964 + */ 2965 + BUG_ON(dx_root_bh); 2966 + 3217 2967 status = ocfs2_expand_inline_dir(dir, parent_fe_bh, 3218 2968 blocks_wanted, lookup, 3219 2969 &new_bh); ··· 3228 2964 mlog_errno(status); 3229 2965 goto bail; 3230 2966 } 2967 + 2968 + /* Expansion from inline to an indexed directory will 2969 + * have given us this. */ 2970 + dx_root_bh = lookup->dl_dx_root_bh; 3231 2971 3232 2972 if (blocks_wanted == 1) { 3233 2973 /* ··· 3296 3028 } 3297 3029 3298 3030 do_extend: 3031 + if (ocfs2_dir_indexed(dir)) 3032 + credits++; /* For attaching the new dirent block to the 3033 + * dx_root */ 3034 + 3299 3035 down_write(&OCFS2_I(dir)->ip_alloc_sem); 3300 3036 drop_alloc_sem = 1; 3301 3037 ··· 3330 3058 3331 3059 de = (struct ocfs2_dir_entry *) new_bh->b_data; 3332 3060 de->inode = 0; 3333 - if (ocfs2_dir_has_trailer(dir)) { 3061 + if (ocfs2_supports_dir_trailer(dir)) { 3334 3062 de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb)); 3335 - ocfs2_init_dir_trailer(dir, new_bh); 3063 + 3064 + ocfs2_init_dir_trailer(dir, new_bh, le16_to_cpu(de->rec_len)); 3065 + 3066 + if (ocfs2_dir_indexed(dir)) { 3067 + status = ocfs2_dx_dir_link_trailer(dir, handle, 3068 + dx_root_bh, new_bh); 3069 + if (status) { 3070 + mlog_errno(status); 3071 + goto bail; 3072 + } 3073 + } 3336 3074 } else { 3337 3075 de->rec_len = cpu_to_le16(sb->s_blocksize); 3338 3076 } ··· 3398 3116 * This calculates how many free bytes we'd have in block zero, should 3399 3117 * this function force expansion to an extent tree. 3400 3118 */ 3401 - if (ocfs2_supports_dir_trailer(OCFS2_SB(sb))) 3119 + if (ocfs2_new_dir_wants_trailer(dir)) 3402 3120 free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir); 3403 3121 else 3404 3122 free_space = dir->i_sb->s_blocksize - i_size_read(dir); ··· 3929 3647 return ret; 3930 3648 } 3931 3649 3650 + static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir, 3651 + struct buffer_head *di_bh, 3652 + struct buffer_head *dx_root_bh, 3653 + const char *name, int namelen, 3654 + struct ocfs2_dir_lookup_result *lookup) 3655 + { 3656 + int ret, rebalanced = 0; 3657 + struct ocfs2_dx_root_block *dx_root; 3658 + struct buffer_head *dx_leaf_bh = NULL; 3659 + struct ocfs2_dx_leaf *dx_leaf; 3660 + u64 blkno; 3661 + u32 leaf_cpos; 3662 + 3663 + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 3664 + 3665 + restart_search: 3666 + ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo, 3667 + &leaf_cpos, &blkno); 3668 + if (ret) { 3669 + mlog_errno(ret); 3670 + goto out; 3671 + } 3672 + 3673 + ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh); 3674 + if (ret) { 3675 + mlog_errno(ret); 3676 + goto out; 3677 + } 3678 + 3679 + dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data; 3680 + 3681 + if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >= 3682 + le16_to_cpu(dx_leaf->dl_list.de_count)) { 3683 + if (rebalanced) { 3684 + /* 3685 + * Rebalancing should have provided us with 3686 + * space in an appropriate leaf. 3687 + * 3688 + * XXX: Is this an abnormal condition then? 3689 + * Should we print a message here? 3690 + */ 3691 + ret = -ENOSPC; 3692 + goto out; 3693 + } 3694 + 3695 + ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh, 3696 + &lookup->dl_hinfo, leaf_cpos, 3697 + blkno); 3698 + if (ret) { 3699 + if (ret != -ENOSPC) 3700 + mlog_errno(ret); 3701 + goto out; 3702 + } 3703 + 3704 + /* 3705 + * Restart the lookup. The rebalance might have 3706 + * changed which block our item fits into. Mark our 3707 + * progress, so we only execute this once. 3708 + */ 3709 + brelse(dx_leaf_bh); 3710 + dx_leaf_bh = NULL; 3711 + rebalanced = 1; 3712 + goto restart_search; 3713 + } 3714 + 3715 + lookup->dl_dx_leaf_bh = dx_leaf_bh; 3716 + dx_leaf_bh = NULL; 3717 + 3718 + out: 3719 + brelse(dx_leaf_bh); 3720 + return ret; 3721 + } 3722 + 3723 + static int ocfs2_search_dx_free_list(struct inode *dir, 3724 + struct buffer_head *dx_root_bh, 3725 + int namelen, 3726 + struct ocfs2_dir_lookup_result *lookup) 3727 + { 3728 + int ret = -ENOSPC; 3729 + struct buffer_head *leaf_bh = NULL, *prev_leaf_bh = NULL; 3730 + struct ocfs2_dir_block_trailer *db; 3731 + u64 next_block; 3732 + int rec_len = OCFS2_DIR_REC_LEN(namelen); 3733 + struct ocfs2_dx_root_block *dx_root; 3734 + 3735 + dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 3736 + next_block = le64_to_cpu(dx_root->dr_free_blk); 3737 + 3738 + while (next_block) { 3739 + brelse(prev_leaf_bh); 3740 + prev_leaf_bh = leaf_bh; 3741 + leaf_bh = NULL; 3742 + 3743 + ret = ocfs2_read_dir_block_direct(dir, next_block, &leaf_bh); 3744 + if (ret) { 3745 + mlog_errno(ret); 3746 + goto out; 3747 + } 3748 + 3749 + db = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb); 3750 + if (rec_len <= le16_to_cpu(db->db_free_rec_len)) { 3751 + lookup->dl_leaf_bh = leaf_bh; 3752 + lookup->dl_prev_leaf_bh = prev_leaf_bh; 3753 + leaf_bh = NULL; 3754 + prev_leaf_bh = NULL; 3755 + break; 3756 + } 3757 + 3758 + next_block = le64_to_cpu(db->db_free_next); 3759 + } 3760 + 3761 + if (!next_block) 3762 + ret = -ENOSPC; 3763 + 3764 + out: 3765 + 3766 + brelse(leaf_bh); 3767 + brelse(prev_leaf_bh); 3768 + return ret; 3769 + } 3770 + 3932 3771 static int ocfs2_expand_inline_dx_root(struct inode *dir, 3933 3772 struct buffer_head *dx_root_bh) 3934 3773 { ··· 4182 3779 return 0; 4183 3780 } 4184 3781 4185 - static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir, 4186 - struct buffer_head *di_bh, const char *name, 4187 - int namelen, 4188 - struct ocfs2_dir_lookup_result *lookup) 3782 + static int ocfs2_prepare_dx_dir_for_insert(struct inode *dir, 3783 + struct buffer_head *di_bh, 3784 + const char *name, 3785 + int namelen, 3786 + struct ocfs2_dir_lookup_result *lookup) 4189 3787 { 4190 - int ret, rebalanced = 0; 3788 + int ret, free_dx_root = 1; 3789 + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 4191 3790 struct buffer_head *dx_root_bh = NULL; 4192 - struct ocfs2_dx_root_block *dx_root; 4193 - struct buffer_head *dx_leaf_bh = NULL; 4194 - struct ocfs2_dx_leaf *dx_leaf; 3791 + struct buffer_head *leaf_bh = NULL; 4195 3792 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 4196 - u64 blkno; 4197 - u32 leaf_cpos; 3793 + struct ocfs2_dx_root_block *dx_root; 4198 3794 4199 3795 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh); 4200 3796 if (ret) { ··· 4220 3818 } 4221 3819 } 4222 3820 4223 - restart_search: 4224 - ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo, 4225 - &leaf_cpos, &blkno); 3821 + /* 3822 + * Insert preparation for an indexed directory is split into two 3823 + * steps. The call to find_dir_space_dx reserves room in the index for 3824 + * an additional item. If we run out of space there, it's a real error 3825 + * we can't continue on. 3826 + */ 3827 + ret = ocfs2_find_dir_space_dx(osb, dir, di_bh, dx_root_bh, name, 3828 + namelen, lookup); 4226 3829 if (ret) { 4227 3830 mlog_errno(ret); 4228 3831 goto out; 4229 3832 } 4230 3833 4231 - ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh); 4232 - if (ret) { 3834 + search_el: 3835 + /* 3836 + * Next, we need to find space in the unindexed tree. This call 3837 + * searches using the free space linked list. If the unindexed tree 3838 + * lacks sufficient space, we'll expand it below. The expansion code 3839 + * is smart enough to add any new blocks to the free space list. 3840 + */ 3841 + ret = ocfs2_search_dx_free_list(dir, dx_root_bh, namelen, lookup); 3842 + if (ret && ret != -ENOSPC) { 4233 3843 mlog_errno(ret); 4234 3844 goto out; 4235 3845 } 4236 3846 4237 - dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data; 3847 + /* Do this up here - ocfs2_extend_dir might need the dx_root */ 3848 + lookup->dl_dx_root_bh = dx_root_bh; 3849 + free_dx_root = 0; 4238 3850 4239 - if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >= 4240 - le16_to_cpu(dx_leaf->dl_list.de_count)) { 4241 - if (rebalanced) { 4242 - /* 4243 - * Rebalancing should have provided us with 4244 - * space in an appropriate leaf. 4245 - * 4246 - * XXX: Is this an abnormal condition then? 4247 - * Should we print a message here? 4248 - */ 4249 - ret = -ENOSPC; 4250 - goto out; 4251 - } 3851 + if (ret == -ENOSPC) { 3852 + ret = ocfs2_extend_dir(osb, dir, di_bh, 1, lookup, &leaf_bh); 4252 3853 4253 - ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh, 4254 - &lookup->dl_hinfo, leaf_cpos, 4255 - blkno); 4256 3854 if (ret) { 4257 - if (ret != -ENOSPC) 4258 - mlog_errno(ret); 3855 + mlog_errno(ret); 4259 3856 goto out; 4260 3857 } 4261 3858 4262 3859 /* 4263 - * Restart the lookup. The rebalance might have 4264 - * changed which block our item fits into. Mark our 4265 - * progress, so we only execute this once. 3860 + * We make the assumption here that new leaf blocks are added 3861 + * to the front of our free list. 4266 3862 */ 4267 - brelse(dx_leaf_bh); 4268 - dx_leaf_bh = NULL; 4269 - rebalanced = 1; 4270 - goto restart_search; 3863 + lookup->dl_prev_leaf_bh = NULL; 3864 + lookup->dl_leaf_bh = leaf_bh; 4271 3865 } 4272 3866 4273 - search_el: 4274 - lookup->dl_dx_leaf_bh = dx_leaf_bh; 4275 - dx_leaf_bh = NULL; 4276 - lookup->dl_dx_root_bh = dx_root_bh; 4277 - dx_root_bh = NULL; 4278 - 4279 3867 out: 4280 - brelse(dx_leaf_bh); 4281 - brelse(dx_root_bh); 3868 + if (free_dx_root) 3869 + brelse(dx_root_bh); 4282 3870 return ret; 4283 3871 } 4284 3872 ··· 4313 3921 ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo); 4314 3922 4315 3923 if (ocfs2_dir_indexed(dir)) { 4316 - ret = ocfs2_find_dir_space_dx(osb, dir, parent_fe_bh, name, 4317 - namelen, lookup); 4318 - if (ret) { 3924 + ret = ocfs2_prepare_dx_dir_for_insert(dir, parent_fe_bh, 3925 + name, namelen, lookup); 3926 + if (ret) 4319 3927 mlog_errno(ret); 4320 - goto out; 4321 - } 4322 - 4323 - /* 4324 - * We intentionally fall through so that the unindexed 4325 - * tree can also be prepared. 4326 - */ 3928 + goto out; 4327 3929 } 4328 3930 4329 3931 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+8
fs/ocfs2/dir.h
··· 39 39 40 40 struct buffer_head *dl_dx_root_bh; /* Root of indexed 41 41 * tree */ 42 + 42 43 struct buffer_head *dl_dx_leaf_bh; /* Indexed leaf block */ 43 44 struct ocfs2_dx_entry *dl_dx_entry; /* Target dx_entry in 44 45 * indexed leaf */ 45 46 struct ocfs2_dx_hinfo dl_hinfo; /* Name hash results */ 47 + 48 + struct buffer_head *dl_prev_leaf_bh;/* Previous entry in 49 + * dir free space 50 + * list. NULL if 51 + * previous entry is 52 + * dx root block. */ 46 53 }; 54 + 47 55 void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res); 48 56 49 57 int ocfs2_find_entry(const char *name, int namelen,
+6 -6
fs/ocfs2/journal.h
··· 385 385 } 386 386 387 387 /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + 388 - * bitmap block for the new bit) */ 389 - #define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) 388 + * bitmap block for the new bit) dx_root update for free list */ 389 + #define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1) 390 390 391 391 static inline int ocfs2_add_dir_index_credits(struct super_block *sb) 392 392 { ··· 420 420 #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) 421 421 422 422 /* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota 423 - * update on dir + index leaf */ 423 + * update on dir + index leaf + dx root update for free list */ 424 424 static inline int ocfs2_link_credits(struct super_block *sb) 425 425 { 426 - return 2*OCFS2_INODE_UPDATE_CREDITS + 2 + 426 + return 2*OCFS2_INODE_UPDATE_CREDITS + 3 + 427 427 ocfs2_quota_trans_credits(sb); 428 428 } 429 429 430 430 /* inode + dir inode (if we unlink a dir), + dir entry block + orphan 431 - * dir inode link + dir inode index leaf */ 431 + * dir inode link + dir inode index leaf + dir index root */ 432 432 static inline int ocfs2_unlink_credits(struct super_block *sb) 433 433 { 434 434 /* The quota update from ocfs2_link_credits is unused here... */ 435 - return 2 * OCFS2_INODE_UPDATE_CREDITS + 2 + ocfs2_link_credits(sb); 435 + return 2 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_link_credits(sb); 436 436 } 437 437 438 438 /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
+4 -1
fs/ocfs2/ocfs2_fs.h
··· 416 416 #define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \ 417 417 OCFS2_DIR_ROUND) & \ 418 418 ~OCFS2_DIR_ROUND) 419 + #define OCFS2_DIR_MIN_REC_LEN OCFS2_DIR_REC_LEN(1) 419 420 420 421 #define OCFS2_LINK_MAX 32000 421 422 ··· 843 842 __le16 dr_reserved1; 844 843 __le64 dr_dir_blkno; /* Pointer to parent inode */ 845 844 __le64 dr_reserved2; 846 - __le64 dr_reserved3[16]; 845 + __le64 dr_free_blk; /* Pointer to head of free 846 + * unindexed block list. */ 847 + __le64 dr_reserved3[15]; 847 848 union { 848 849 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128 849 850 * bits for maximum space