Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ext4: add largedir feature

This INCOMPAT_LARGEDIR feature allows larger directories to be created
in ldiskfs, both with directory sizes over 2GB and a maximum htree
depth of 3 instead of the current limit of 2. These features are needed
in order to exceed the current limit of approximately 10M entries in a
single directory.

This patch was originally written by Yang Sheng to support the Lustre server.

[ Bumped the credits needed to update an indexed directory -- tytso ]

Signed-off-by: Liang Zhen <liang.zhen@intel.com>
Signed-off-by: Yang Sheng <yang.sheng@intel.com>
Signed-off-by: Artem Blagodarenko <artem.blagodarenko@seagate.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>

authored by

Artem Blagodarenko and committed by
Theodore Ts'o
e08ac99f 67a7d5f5

+113 -47
+18 -5
fs/ext4/ext4.h
··· 1800 1800 EXT4_FEATURE_INCOMPAT_MMP | \ 1801 1801 EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ 1802 1802 EXT4_FEATURE_INCOMPAT_ENCRYPT | \ 1803 - EXT4_FEATURE_INCOMPAT_CSUM_SEED) 1803 + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ 1804 + EXT4_FEATURE_INCOMPAT_LARGEDIR) 1804 1805 #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ 1805 1806 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ 1806 1807 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ ··· 2126 2125 * Special error return code only used by dx_probe() and its callers. 2127 2126 */ 2128 2127 #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) 2128 + 2129 + /* htree levels for ext4 */ 2130 + #define EXT4_HTREE_LEVEL_COMPAT 2 2131 + #define EXT4_HTREE_LEVEL 3 2132 + 2133 + static inline int ext4_dir_htree_level(struct super_block *sb) 2134 + { 2135 + return ext4_has_feature_largedir(sb) ? 2136 + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; 2137 + } 2129 2138 2130 2139 /* 2131 2140 * Timeout and state flag for lazy initialization inode thread. ··· 2767 2756 es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); 2768 2757 } 2769 2758 2770 - static inline loff_t ext4_isize(struct ext4_inode *raw_inode) 2759 + static inline loff_t ext4_isize(struct super_block *sb, 2760 + struct ext4_inode *raw_inode) 2771 2761 { 2772 - if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) 2762 + if (ext4_has_feature_largedir(sb) || 2763 + S_ISREG(le16_to_cpu(raw_inode->i_mode))) 2773 2764 return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | 2774 2765 le32_to_cpu(raw_inode->i_size_lo); 2775 - else 2776 - return (loff_t) le32_to_cpu(raw_inode->i_size_lo); 2766 + 2767 + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); 2777 2768 } 2778 2769 2779 2770 static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
+8 -1
fs/ext4/ext4_jbd2.h
··· 77 77 78 78 #define EXT4_RESERVE_TRANS_BLOCKS 12U 79 79 80 - #define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8 80 + /* 81 + * Number of credits needed if we need to insert an entry into a 82 + * directory. For each new index block, we need 4 blocks (old index 83 + * block, new index block, bitmap block, bg summary). For normal 84 + * htree directories there are 2 levels; if the largedir feature 85 + * enabled it's 3 levels. 86 + */ 87 + #define EXT4_INDEX_EXTRA_TRANS_BLOCKS 12U 81 88 82 89 #ifdef CONFIG_QUOTA 83 90 /* Amount of blocks needed for quota update - we know that the structure was
+2 -2
fs/ext4/inode.c
··· 4712 4712 if (ext4_has_feature_64bit(sb)) 4713 4713 ei->i_file_acl |= 4714 4714 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; 4715 - inode->i_size = ext4_isize(raw_inode); 4715 + inode->i_size = ext4_isize(sb, raw_inode); 4716 4716 if ((size = i_size_read(inode)) < 0) { 4717 4717 EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); 4718 4718 ret = -EFSCORRUPTED; ··· 5037 5037 raw_inode->i_file_acl_high = 5038 5038 cpu_to_le16(ei->i_file_acl >> 32); 5039 5039 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); 5040 - if (ei->i_disksize != ext4_isize(raw_inode)) { 5040 + if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) { 5041 5041 ext4_isize_set(raw_inode, ei->i_disksize); 5042 5042 need_datasync = 1; 5043 5043 }
+85 -39
fs/ext4/namei.c
··· 513 513 514 514 static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) 515 515 { 516 - return le32_to_cpu(entry->block) & 0x00ffffff; 516 + return le32_to_cpu(entry->block) & 0x0fffffff; 517 517 } 518 518 519 519 static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) ··· 739 739 struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); 740 740 u32 hash; 741 741 742 + memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); 742 743 frame->bh = ext4_read_dirblock(dir, 0, INDEX); 743 744 if (IS_ERR(frame->bh)) 744 745 return (struct dx_frame *) frame->bh; ··· 769 768 } 770 769 771 770 indirect = root->info.indirect_levels; 772 - if (indirect > 1) { 773 - ext4_warning_inode(dir, "Unimplemented hash depth: %#06x", 774 - root->info.indirect_levels); 771 + if (indirect >= ext4_dir_htree_level(dir->i_sb)) { 772 + ext4_warning(dir->i_sb, 773 + "Directory (ino: %lu) htree depth %#06x exceed" 774 + "supported value", dir->i_ino, 775 + ext4_dir_htree_level(dir->i_sb)); 776 + if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { 777 + ext4_warning(dir->i_sb, "Enable large directory " 778 + "feature to access it"); 779 + } 775 780 goto fail; 776 781 } 777 782 ··· 866 859 867 860 static void dx_release(struct dx_frame *frames) 868 861 { 862 + struct dx_root_info *info; 863 + int i; 864 + 869 865 if (frames[0].bh == NULL) 870 866 return; 871 867 872 - if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels) 873 - brelse(frames[1].bh); 874 - brelse(frames[0].bh); 868 + info = &((struct dx_root *)frames[0].bh->b_data)->info; 869 + for (i = 0; i <= info->indirect_levels; i++) { 870 + if (frames[i].bh == NULL) 871 + break; 872 + brelse(frames[i].bh); 873 + frames[i].bh = NULL; 874 + } 875 875 } 876 876 877 877 /* ··· 1064 1050 { 1065 1051 struct dx_hash_info hinfo; 1066 1052 struct ext4_dir_entry_2 *de; 1067 - struct dx_frame frames[2], *frame; 1053 + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; 1068 1054 struct inode *dir; 1069 1055 
ext4_lblk_t block; 1070 1056 int count = 0; ··· 1499 1485 struct ext4_dir_entry_2 **res_dir) 1500 1486 { 1501 1487 struct super_block * sb = dir->i_sb; 1502 - struct dx_frame frames[2], *frame; 1488 + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; 1503 1489 struct buffer_head *bh; 1504 1490 ext4_lblk_t block; 1505 1491 int retval; ··· 1903 1889 */ 1904 1890 dir->i_mtime = dir->i_ctime = current_time(dir); 1905 1891 ext4_update_dx_flag(dir); 1906 - dir->i_version++; 1892 + inode_inc_iversion(dir); 1907 1893 ext4_mark_inode_dirty(handle, dir); 1908 1894 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 1909 1895 err = ext4_handle_dirty_dirent_node(handle, dir, bh); ··· 1922 1908 { 1923 1909 struct buffer_head *bh2; 1924 1910 struct dx_root *root; 1925 - struct dx_frame frames[2], *frame; 1911 + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; 1926 1912 struct dx_entry *entries; 1927 1913 struct ext4_dir_entry_2 *de, *de2; 1928 1914 struct ext4_dir_entry_tail *t; ··· 2141 2127 static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, 2142 2128 struct inode *dir, struct inode *inode) 2143 2129 { 2144 - struct dx_frame frames[2], *frame; 2130 + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; 2145 2131 struct dx_entry *entries, *at; 2146 2132 struct buffer_head *bh; 2147 2133 struct super_block *sb = dir->i_sb; 2148 2134 struct ext4_dir_entry_2 *de; 2135 + int restart; 2149 2136 int err; 2150 2137 2138 + again: 2139 + restart = 0; 2151 2140 frame = dx_probe(fname, dir, NULL, frames); 2152 2141 if (IS_ERR(frame)) 2153 2142 return PTR_ERR(frame); ··· 2172 2155 if (err != -ENOSPC) 2173 2156 goto cleanup; 2174 2157 2158 + err = 0; 2175 2159 /* Block full, should compress but for now just split */ 2176 2160 dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", 2177 2161 dx_get_count(entries), dx_get_limit(entries))); 2178 2162 /* Need to split index? 
*/ 2179 2163 if (dx_get_count(entries) == dx_get_limit(entries)) { 2180 2164 ext4_lblk_t newblock; 2181 - unsigned icount = dx_get_count(entries); 2182 - int levels = frame - frames; 2165 + int levels = frame - frames + 1; 2166 + unsigned int icount; 2167 + int add_level = 1; 2183 2168 struct dx_entry *entries2; 2184 2169 struct dx_node *node2; 2185 2170 struct buffer_head *bh2; 2186 2171 2187 - if (levels && (dx_get_count(frames->entries) == 2188 - dx_get_limit(frames->entries))) { 2189 - ext4_warning_inode(dir, "Directory index full!"); 2172 + while (frame > frames) { 2173 + if (dx_get_count((frame - 1)->entries) < 2174 + dx_get_limit((frame - 1)->entries)) { 2175 + add_level = 0; 2176 + break; 2177 + } 2178 + frame--; /* split higher index block */ 2179 + at = frame->at; 2180 + entries = frame->entries; 2181 + restart = 1; 2182 + } 2183 + if (add_level && levels == ext4_dir_htree_level(sb)) { 2184 + ext4_warning(sb, "Directory (ino: %lu) index full, " 2185 + "reach max htree level :%d", 2186 + dir->i_ino, levels); 2187 + if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { 2188 + ext4_warning(sb, "Large directory feature is " 2189 + "not enabled on this " 2190 + "filesystem"); 2191 + } 2190 2192 err = -ENOSPC; 2191 2193 goto cleanup; 2192 2194 } 2195 + icount = dx_get_count(entries); 2193 2196 bh2 = ext4_append(handle, dir, &newblock); 2194 2197 if (IS_ERR(bh2)) { 2195 2198 err = PTR_ERR(bh2); ··· 2224 2187 err = ext4_journal_get_write_access(handle, frame->bh); 2225 2188 if (err) 2226 2189 goto journal_error; 2227 - if (levels) { 2190 + if (!add_level) { 2228 2191 unsigned icount1 = icount/2, icount2 = icount - icount1; 2229 2192 unsigned hash2 = dx_get_hash(entries + icount1); 2230 2193 dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", ··· 2232 2195 2233 2196 BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ 2234 2197 err = ext4_journal_get_write_access(handle, 2235 - frames[0].bh); 2198 + (frame - 1)->bh); 2236 2199 if (err) 2237 2200 goto 
journal_error; 2238 2201 ··· 2248 2211 frame->entries = entries = entries2; 2249 2212 swap(frame->bh, bh2); 2250 2213 } 2251 - dx_insert_block(frames + 0, hash2, newblock); 2252 - dxtrace(dx_show_index("node", frames[1].entries)); 2214 + dx_insert_block((frame - 1), hash2, newblock); 2215 + dxtrace(dx_show_index("node", frame->entries)); 2253 2216 dxtrace(dx_show_index("node", 2254 2217 ((struct dx_node *) bh2->b_data)->entries)); 2255 2218 err = ext4_handle_dirty_dx_node(handle, dir, bh2); 2256 2219 if (err) 2257 2220 goto journal_error; 2258 2221 brelse (bh2); 2222 + err = ext4_handle_dirty_dx_node(handle, dir, 2223 + (frame - 1)->bh); 2224 + if (err) 2225 + goto journal_error; 2226 + if (restart) { 2227 + err = ext4_handle_dirty_dx_node(handle, dir, 2228 + frame->bh); 2229 + goto journal_error; 2230 + } 2259 2231 } else { 2260 - dxtrace(printk(KERN_DEBUG 2261 - "Creating second level index...\n")); 2232 + struct dx_root *dxroot; 2262 2233 memcpy((char *) entries2, (char *) entries, 2263 2234 icount * sizeof(struct dx_entry)); 2264 2235 dx_set_limit(entries2, dx_node_limit(dir)); ··· 2274 2229 /* Set up root */ 2275 2230 dx_set_count(entries, 1); 2276 2231 dx_set_block(entries + 0, newblock); 2277 - ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; 2278 - 2279 - /* Add new access path frame */ 2280 - frame = frames + 1; 2281 - frame->at = at = at - entries + entries2; 2282 - frame->entries = entries = entries2; 2283 - frame->bh = bh2; 2284 - err = ext4_journal_get_write_access(handle, 2285 - frame->bh); 2232 + dxroot = (struct dx_root *)frames[0].bh->b_data; 2233 + dxroot->info.indirect_levels += 1; 2234 + dxtrace(printk(KERN_DEBUG 2235 + "Creating %d level index...\n", 2236 + info->indirect_levels)); 2237 + err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); 2286 2238 if (err) 2287 2239 goto journal_error; 2288 - } 2289 - err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh); 2290 - if (err) { 2291 - ext4_std_error(inode->i_sb, err); 
2292 - goto cleanup; 2240 + err = ext4_handle_dirty_dx_node(handle, dir, bh2); 2241 + brelse(bh2); 2242 + restart = 1; 2243 + goto journal_error; 2293 2244 } 2294 2245 } 2295 2246 de = do_split(handle, dir, &bh, frame, &fname->hinfo); ··· 2297 2256 goto cleanup; 2298 2257 2299 2258 journal_error: 2300 - ext4_std_error(dir->i_sb, err); 2259 + ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */ 2301 2260 cleanup: 2302 2261 brelse(bh); 2303 2262 dx_release(frames); 2263 + /* @restart is true means htree-path has been changed, we need to 2264 + * repeat dx_probe() to find out valid htree-path 2265 + */ 2266 + if (restart && err == 0) 2267 + goto again; 2304 2268 return err; 2305 2269 } 2306 2270 ··· 2342 2296 blocksize); 2343 2297 else 2344 2298 de->inode = 0; 2345 - dir->i_version++; 2299 + inode_inc_iversion(dir); 2346 2300 return 0; 2347 2301 } 2348 2302 i += ext4_rec_len_from_disk(de->rec_len, blocksize);