Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
"The big new feature added this time is supporting online resizing
using the meta_bg feature. This allows us to resize file systems
which are greater than 16TB. In addition, the speed of online
resizing has been improved in general.

We also fix a number of races, some of which could lead to deadlocks,
in ext4's Asynchronous I/O and online defrag support, thanks to good
work by Dmitry Monakhov.

There are also a large number of more minor bug fixes and cleanups
from a number of other ext4 contributors, quite a few of whom have
submitted fixes for the first time."

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (69 commits)
ext4: fix ext4_flush_completed_IO wait semantics
ext4: fix mtime update in nodelalloc mode
ext4: fix ext_remove_space for punch_hole case
ext4: punch_hole should wait for DIO writers
ext4: serialize truncate with owerwrite DIO workers
ext4: endless truncate due to nonlocked dio readers
ext4: serialize unlocked dio reads with truncate
ext4: serialize dio nonlocked reads with defrag workers
ext4: completed_io locking cleanup
ext4: fix unwritten counter leakage
ext4: give i_aiodio_unwritten a more appropriate name
ext4: ext4_inode_info diet
ext4: convert to use leXX_add_cpu()
ext4: ext4_bread usage audit
fs: reserve fallocate flag codepoint
ext4: remove redundant offset check in mext_check_arguments()
ext4: don't clear orphan list on ro mount with errors
jbd2: fix assertion failure in commit code due to lacking transaction credits
ext4: release donor reference when EXT4_IOC_MOVE_EXT ioctl fails
ext4: enable FITRIM ioctl on bigalloc file system
...

+1501 -899
Documentation/ABI/testing/sysfs-fs-ext4 (+13)
···
 Description:
         The maximum number of megabytes the writeback code will
         try to write out before move on to another inode.
+
+What:		/sys/fs/ext4/<disk>/extent_max_zeroout_kb
+Date:		August 2012
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+        The maximum number of kilobytes which will be zeroed
+        out in preference to creating a new uninitialized
+        extent when manipulating an inode's extent tree.  Note
+        that using a larger value will increase the
+        variability of time necessary to complete a random
+        write operation (since a 4k random write might turn
+        into a much larger write due to the zeroout
+        operation).
Documentation/filesystems/ext4.txt (+10)
···
         Because of the restrictions this options comprises
         it is off by default (e.g. dioread_lock).

+max_dir_size_kb=n	This limits the size of directories so that any
+        attempt to expand them beyond the specified
+        limit in kilobytes will cause an ENOSPC error.
+        This is useful in memory constrained
+        environments, where a very large directory can
+        cause severe performance problems or even
+        provoke the Out Of Memory killer. (For example,
+        if there is only 512mb memory available, a 176mb
+        directory may seriously cramp the system's style.)
+
 i_version	Enable 64-bit inode version support. This option is
         off by default.
fs/buffer.c (+7 -6)
···
        loff_t size;
        int ret;

-       /*
-        * Update file times before taking page lock. We may end up failing the
-        * fault so this update may be superfluous but who really cares...
-        */
-       file_update_time(vma->vm_file);
-
        lock_page(page);
        size = i_size_read(inode);
        if ((page->mapping != inode->i_mapping) ||
···
        struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;

        sb_start_pagefault(sb);
+
+       /*
+        * Update file times before taking page lock. We may end up failing the
+        * fault so this update may be superfluous but who really cares...
+        */
+       file_update_time(vma->vm_file);
+
        ret = __block_page_mkwrite(vma, vmf, get_block);
        sb_end_pagefault(sb);
        return block_page_mkwrite_return(ret);
fs/ext4/ext4.h (+42 -7)
···
 #define EXT4_IO_END_ERROR	0x0002
 #define EXT4_IO_END_QUEUED	0x0004
 #define EXT4_IO_END_DIRECT	0x0008
-#define EXT4_IO_END_IN_FSYNC	0x0010

 struct ext4_io_page {
        struct page	*p_page;
···
        struct list_head i_completed_io_list;
        spinlock_t i_completed_io_lock;
        atomic_t i_ioend_count;	/* Number of outstanding io_end structs */
-       /* current io_end structure for async DIO write*/
-       ext4_io_end_t *cur_aio_dio;
-       atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
+       atomic_t i_unwritten; /* Nr. of inflight conversions pending */

        spinlock_t i_block_reservation_lock;
···
        spinlock_t s_md_lock;
        unsigned short *s_mb_offsets;
        unsigned int *s_mb_maxs;
+       unsigned int s_group_info_size;

        /* tunables */
        unsigned long s_stripe;
···
        unsigned int s_mb_order2_reqs;
        unsigned int s_mb_group_prealloc;
        unsigned int s_max_writeback_mb_bump;
+       unsigned int s_max_dir_size_kb;
        /* where last allocation was done - for stream allocation */
        unsigned long s_mb_last_group;
        unsigned long s_mb_last_start;
···
        unsigned long s_sectors_written_start;
        u64 s_kbytes_written;

+       /* the size of zero-out chunk */
+       unsigned int s_extent_max_zeroout_kb;
+
        unsigned int s_log_groups_per_flex;
        struct flex_groups *s_flex_groups;
+       ext4_group_t s_flex_groups_allocated;

        /* workqueue for dio unwritten */
        struct workqueue_struct *dio_unwritten_wq;
···
 {
        if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
                io_end->flag |= EXT4_IO_END_UNWRITTEN;
-               atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+               atomic_inc(&EXT4_I(inode)->i_unwritten);
        }
+}
+
+static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
+{
+       return inode->i_private;
+}
+
+static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
+{
+       inode->i_private = io;
 }

 /*
···
        EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
        EXT4_STATE_NEWENTRY,		/* File just added to dir */
        EXT4_STATE_DELALLOC_RESERVED,	/* blks already reserved for delalloc */
+       EXT4_STATE_DIOREAD_LOCK,	/* Disable support for dio read
+                                          nolocking */
 };

 #define EXT4_INODE_BIT_FNS(name, field, offset)			\
···
 /* fsync.c */
 extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
-extern int ext4_flush_completed_IO(struct inode *);
+extern int ext4_flush_unwritten_io(struct inode *);

 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
···
 extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
                             struct buffer_head *bh, ext4_fsblk_t block,
                             unsigned long count, int flags);
+extern int ext4_mb_alloc_groupinfo(struct super_block *sb,
+                                  ext4_group_t ngroups);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
                                 ext4_group_t i, struct ext4_group_desc *desc);
 extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
···
 extern void *ext4_kvmalloc(size_t size, gfp_t flags);
 extern void *ext4_kvzalloc(size_t size, gfp_t flags);
 extern void ext4_kvfree(void *ptr);
+extern int ext4_alloc_flex_bg_array(struct super_block *sb,
+                                   ext4_group_t ngroup);
 extern __printf(4, 5)
 void __ext4_error(struct super_block *, const char *, unsigned int,
                  const char *, ...);
···
 extern const struct inode_operations ext4_file_inode_operations;
 extern const struct file_operations ext4_file_operations;
 extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
+extern void ext4_unwritten_wait(struct inode *inode);

 /* namei.c */
 extern const struct inode_operations ext4_dir_inode_operations;
···
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
+extern void ext4_add_complete_io(ext4_io_end_t *io_end);
 extern void ext4_exit_pageio(void);
 extern void ext4_ioend_wait(struct inode *);
 extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
-extern int ext4_end_io_nolock(ext4_io_end_t *io);
 extern void ext4_io_submit(struct ext4_io_submit *io);
 extern int ext4_bio_write_page(struct ext4_io_submit *io,
                               struct page *page,
···
 static inline void set_bitmap_uptodate(struct buffer_head *bh)
 {
        set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
+}
+
+/*
+ * Disable DIO read nolock optimization, so new dioreaders will be forced
+ * to grab i_mutex
+ */
+static inline void ext4_inode_block_unlocked_dio(struct inode *inode)
+{
+       ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+       smp_mb();
+}
+static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
+{
+       smp_mb();
+       ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
 }

 #define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
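The new ext4_inode_block_unlocked_dio()/ext4_inode_resume_unlocked_dio() helpers above pair the EXT4_STATE_DIOREAD_LOCK inode state bit with smp_mb() so that a writer can force new unlocked DIO readers onto the locked path, while readers re-check the bit after taking an i_dio_count reference. A rough user-space sketch of that handshake, using C11 atomics and fences in place of the kernel's inode state bits and smp_mb() (all names here are illustrative, not part of ext4):

```c
#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool dioread_lock;   /* stands in for EXT4_STATE_DIOREAD_LOCK */
static atomic_int  dio_count;      /* stands in for inode->i_dio_count */

/* Force new "unlocked" readers onto the locked path. */
static void block_unlocked_dio(void)
{
        atomic_store(&dioread_lock, true);
        atomic_thread_fence(memory_order_seq_cst);  /* pairs with reader fence */
}

static void resume_unlocked_dio(void)
{
        atomic_thread_fence(memory_order_seq_cst);
        atomic_store(&dioread_lock, false);
}

/* Returns true if the reader may proceed without the "mutex". */
static bool try_unlocked_read(void)
{
        atomic_fetch_add(&dio_count, 1);            /* hold a ref while checking */
        atomic_thread_fence(memory_order_seq_cst);
        if (atomic_load(&dioread_lock)) {
                atomic_fetch_sub(&dio_count, 1);    /* i.e. inode_dio_done() */
                return false;                       /* fall back to locked path */
        }
        atomic_fetch_sub(&dio_count, 1);            /* done with the "I/O" */
        return true;
}
```

The ordering matters: the blocker publishes the flag before waiting for the count to drain, and the reader bumps the count before re-reading the flag, so one side always observes the other.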
fs/ext4/extents.c (+151 -107)
···
                  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
                  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));

-       neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1);
+       le16_add_cpu(&neh->eh_depth, 1);
        ext4_mark_inode_dirty(handle, inode);
 out:
        brelse(bh);
···
 }

 /*
+ * This function does a very simple check to see if we can collapse
+ * an extent tree with a single extent tree leaf block into the inode.
+ */
+static void ext4_ext_try_to_merge_up(handle_t *handle,
+                                    struct inode *inode,
+                                    struct ext4_ext_path *path)
+{
+       size_t s;
+       unsigned max_root = ext4_ext_space_root(inode, 0);
+       ext4_fsblk_t blk;
+
+       if ((path[0].p_depth != 1) ||
+           (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
+           (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
+               return;
+
+       /*
+        * We need to modify the block allocation bitmap and the block
+        * group descriptor to release the extent tree block.  If we
+        * can't get the journal credits, give up.
+        */
+       if (ext4_journal_extend(handle, 2))
+               return;
+
+       /*
+        * Copy the extent data up to the inode
+        */
+       blk = ext4_idx_pblock(path[0].p_idx);
+       s = le16_to_cpu(path[1].p_hdr->eh_entries) *
+               sizeof(struct ext4_extent_idx);
+       s += sizeof(struct ext4_extent_header);
+
+       memcpy(path[0].p_hdr, path[1].p_hdr, s);
+       path[0].p_depth = 0;
+       path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
+               (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
+       path[0].p_hdr->eh_max = cpu_to_le16(max_root);
+
+       brelse(path[1].p_bh);
+       ext4_free_blocks(handle, inode, NULL, blk, 1,
+                        EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
+}
+
+/*
  * This function tries to merge the @ex extent to neighbours in the tree.
  * return 1 if merge left else 0.
  */
-static int ext4_ext_try_to_merge(struct inode *inode,
+static void ext4_ext_try_to_merge(handle_t *handle,
+                                 struct inode *inode,
                                  struct ext4_ext_path *path,
                                  struct ext4_extent *ex) {
        struct ext4_extent_header *eh;
        unsigned int depth;
        int merge_done = 0;
-       int ret = 0;

        depth = ext_depth(inode);
        BUG_ON(path[depth].p_hdr == NULL);
···
                merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);

        if (!merge_done)
-               ret = ext4_ext_try_to_merge_right(inode, path, ex);
+               (void) ext4_ext_try_to_merge_right(inode, path, ex);

-       return ret;
+       ext4_ext_try_to_merge_up(handle, inode, path);
 }

 /*
···
 merge:
        /* try to merge extents */
        if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
-               ext4_ext_try_to_merge(inode, path, nearex);
+               ext4_ext_try_to_merge(handle, inode, path, nearex);


        /* time to correct all indexes above */
···
        if (err)
                goto cleanup;

-       err = ext4_ext_dirty(handle, inode, path + depth);
+       err = ext4_ext_dirty(handle, inode, path + path->p_depth);

 cleanup:
        if (npath) {
···
 }

 /*
- * ext4_ext_check_cache()
+ * ext4_ext_in_cache()
  * Checks to see if the given block is in the cache.
  * If it is, the cached extent is stored in the given
- * cache extent pointer.  If the cached extent is a hole,
- * this routine should be used instead of
- * ext4_ext_in_cache if the calling function needs to
- * know the size of the hole.
+ * cache extent pointer.
  *
  * @inode: The files inode
  * @block: The block to look for in the cache
···
  *
  * Return 0 if cache is invalid; 1 if the cache is valid
  */
-static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
-       struct ext4_ext_cache *ex){
+static int
+ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
+                 struct ext4_extent *ex)
+{
        struct ext4_ext_cache *cex;
        struct ext4_sb_info *sbi;
        int ret = 0;
···
                goto errout;

        if (in_range(block, cex->ec_block, cex->ec_len)) {
-               memcpy(ex, cex, sizeof(struct ext4_ext_cache));
+               ex->ee_block = cpu_to_le32(cex->ec_block);
+               ext4_ext_store_pblock(ex, cex->ec_start);
+               ex->ee_len = cpu_to_le16(cex->ec_len);
                ext_debug("%u cached by %u:%u:%llu\n",
                          block,
                          cex->ec_block, cex->ec_len, cex->ec_start);
···
        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
        return ret;
 }
-
-/*
- * ext4_ext_in_cache()
- * Checks to see if the given block is in the cache.
- * If it is, the cached extent is stored in the given
- * extent pointer.
- *
- * @inode: The files inode
- * @block: The block to look for in the cache
- * @ex: Pointer where the cached extent will be stored
- *	if it contains block
- *
- * Return 0 if cache is invalid; 1 if the cache is valid
- */
-static int
-ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
-                       struct ext4_extent *ex)
-{
-       struct ext4_ext_cache cex;
-       int ret = 0;
-
-       if (ext4_ext_check_cache(inode, block, &cex)) {
-               ex->ee_block = cpu_to_le32(cex.ec_block);
-               ext4_ext_store_pblock(ex, cex.ec_start);
-               ex->ee_len = cpu_to_le16(cex.ec_len);
-               ret = 1;
-       }
-
-       return ret;
-}
-

 /*
  * ext4_ext_rm_idx:
···
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        unsigned short ee_len =  ext4_ext_get_actual_len(ex);
        ext4_fsblk_t pblk;
-       int flags = EXT4_FREE_BLOCKS_FORGET;
+       int flags = 0;

        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-               flags |= EXT4_FREE_BLOCKS_METADATA;
+               flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+       else if (ext4_should_journal_data(inode))
+               flags |= EXT4_FREE_BLOCKS_FORGET;
+
        /*
         * For bigalloc file systems, we never free a partial cluster
         * at the beginning of the extent.  Instead, we make a note
···
        struct ext4_ext_path *path = NULL;
        ext4_fsblk_t partial_cluster = 0;
        handle_t *handle;
-       int i = 0, err;
+       int i = 0, err = 0;

        ext_debug("truncate since %u to %u\n", start, end);
···
                return PTR_ERR(path);
        }
        depth = ext_depth(inode);
+       /* Leaf not may not exist only if inode has no blocks at all */
        ex = path[depth].p_ext;
        if (!ex) {
-               ext4_ext_drop_refs(path);
-               kfree(path);
-               path = NULL;
-               goto cont;
+               if (depth) {
+                       EXT4_ERROR_INODE(inode,
+                                        "path[%d].p_hdr == NULL",
+                                        depth);
+                       err = -EIO;
+               }
+               goto out;
        }

        ee_block = le32_to_cpu(ex->ee_block);
···
                        goto out;
                }
        }
-cont:
-
        /*
         * We start scanning from right side, freeing all the blocks
         * after i_size and walking into the tree depth-wise.
···
                ext4_ext_mark_initialized(ex);

                if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
-                       ext4_ext_try_to_merge(inode, path, ex);
+                       ext4_ext_try_to_merge(handle, inode, path, ex);

-               err = ext4_ext_dirty(handle, inode, path + depth);
+               err = ext4_ext_dirty(handle, inode, path + path->p_depth);
                goto out;
        }
···
                        goto fix_extent_len;
                /* update the extent length and mark as initialized */
                ex->ee_len = cpu_to_le16(ee_len);
-               ext4_ext_try_to_merge(inode, path, ex);
-               err = ext4_ext_dirty(handle, inode, path + depth);
+               ext4_ext_try_to_merge(handle, inode, path, ex);
+               err = ext4_ext_dirty(handle, inode, path + path->p_depth);
                goto out;
        } else if (err)
                goto fix_extent_len;
···
        return err ? err : map->m_len;
 }

-#define EXT4_EXT_ZERO_LEN 7
 /*
  * This function is called by ext4_ext_map_blocks() if someone tries to write
  * to an uninitialized extent. It may result in splitting the uninitialized
···
                           struct ext4_map_blocks *map,
                           struct ext4_ext_path *path)
 {
+       struct ext4_sb_info *sbi;
        struct ext4_extent_header *eh;
        struct ext4_map_blocks split_map;
        struct ext4_extent zero_ex;
        struct ext4_extent *ex;
        ext4_lblk_t ee_block, eof_block;
        unsigned int ee_len, depth;
-       int allocated;
+       int allocated, max_zeroout = 0;
        int err = 0;
        int split_flag = 0;
···
                "block %llu, max_blocks %u\n", inode->i_ino,
                (unsigned long long)map->m_lblk, map->m_len);

+       sbi = EXT4_SB(inode->i_sb);
        eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
                inode->i_sb->s_blocksize_bits;
        if (eof_block < map->m_lblk + map->m_len)
···
         */
        split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;

-       /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
-       if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
-           (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+       if (EXT4_EXT_MAY_ZEROOUT & split_flag)
+               max_zeroout = sbi->s_extent_max_zeroout_kb >>
+                       inode->i_sb->s_blocksize_bits;
+
+       /* If extent is less than s_max_zeroout_kb, zeroout directly */
+       if (max_zeroout && (ee_len <= max_zeroout)) {
                err = ext4_ext_zeroout(inode, ex);
                if (err)
                        goto out;
···
                if (err)
                        goto out;
                ext4_ext_mark_initialized(ex);
-               ext4_ext_try_to_merge(inode, path, ex);
-               err = ext4_ext_dirty(handle, inode, path + depth);
+               ext4_ext_try_to_merge(handle, inode, path, ex);
+               err = ext4_ext_dirty(handle, inode, path + path->p_depth);
                goto out;
        }
···
        split_map.m_lblk = map->m_lblk;
        split_map.m_len = map->m_len;

-       if (allocated > map->m_len) {
-               if (allocated <= EXT4_EXT_ZERO_LEN &&
-                   (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+       if (max_zeroout && (allocated > map->m_len)) {
+               if (allocated <= max_zeroout) {
                        /* case 3 */
                        zero_ex.ee_block =
                                         cpu_to_le32(map->m_lblk);
···
                                goto out;
                        split_map.m_lblk = map->m_lblk;
                        split_map.m_len = allocated;
-               } else if ((map->m_lblk - ee_block + map->m_len <
-                          EXT4_EXT_ZERO_LEN) &&
-                          (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+               } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
                        /* case 2 */
                        if (map->m_lblk != ee_block) {
                                zero_ex.ee_block = ex->ee_block;
···
        }

        allocated = ext4_split_extent(handle, inode, path,
-                                     &split_map, split_flag, 0);
+                                      &split_map, split_flag, 0);
        if (allocated < 0)
                err = allocated;
···
  * to an uninitialized extent.
  *
  * Writing to an uninitialized extent may result in splitting the uninitialized
- * extent into multiple /initialized uninitialized extents (up to three)
+ * extent into multiple initialized/uninitialized extents (up to three)
  * There are three possibilities:
  *   a> There is no split required: Entire extent should be uninitialized
  *   b> Splits in two extents: Write is happening at either end of the extent
···
        /* note: ext4_ext_correct_indexes() isn't needed here because
         * borders are not changed
         */
-       ext4_ext_try_to_merge(inode, path, ex);
+       ext4_ext_try_to_merge(handle, inode, path, ex);

        /* Mark modified extent as dirty */
-       err = ext4_ext_dirty(handle, inode, path + depth);
+       err = ext4_ext_dirty(handle, inode, path + path->p_depth);
 out:
        ext4_ext_show_leaf(inode, path);
        return err;
···
 {
        int ret = 0;
        int err = 0;
-       ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+       ext4_io_end_t *io = ext4_inode_aio(inode);

        ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
                  "block %llu, max_blocks %u, flags %x, allocated %u\n",
···
        if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
                ret = ext4_split_unwritten_extents(handle, inode, map,
                                                   path, flags);
+               if (ret <= 0)
+                       goto out;
                /*
                 * Flag the inode(non aio case) or end_io struct (aio case)
                 * that this IO needs to conversion to written when IO is
···
        unsigned int allocated = 0, offset = 0;
        unsigned int allocated_clusters = 0;
        struct ext4_allocation_request ar;
-       ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+       ext4_io_end_t *io = ext4_inode_aio(inode);
        ext4_lblk_t cluster_offset;
+       int set_unwritten = 0;

        ext_debug("blocks %u/%u requested for inode %lu\n",
                  map->m_lblk, map->m_len, inode->i_ino);
···
                 * For non asycn direct IO case, flag the inode state
                 * that we need to perform conversion when IO is done.
                 */
-               if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
-                       if (io)
-                               ext4_set_io_unwritten_flag(inode, io);
-                       else
-                               ext4_set_inode_state(inode,
-                                                    EXT4_STATE_DIO_UNWRITTEN);
-               }
+               if ((flags & EXT4_GET_BLOCKS_PRE_IO))
+                       set_unwritten = 1;
                if (ext4_should_dioread_nolock(inode))
                        map->m_flags |= EXT4_MAP_UNINIT;
        }
···
        if (!err)
                err = ext4_ext_insert_extent(handle, inode, path,
                                             &newex, flags);
+
+       if (!err && set_unwritten) {
+               if (io)
+                       ext4_set_io_unwritten_flag(inode, io);
+               else
+                       ext4_set_inode_state(inode,
+                                            EXT4_STATE_DIO_UNWRITTEN);
+       }
+
        if (err && free_on_err) {
                int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
                        EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
···
         * finish any pending end_io work so we won't run the risk of
         * converting any truncated blocks to initialized later
         */
-       ext4_flush_completed_IO(inode);
+       ext4_flush_unwritten_io(inode);

        /*
         * probably first extent we're gonna free will be last in block
···
        loff_t first_page_offset, last_page_offset;
        int credits, err = 0;

+       /*
+        * Write out all dirty pages to avoid race conditions
+        * Then release them.
+        */
+       if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+               err = filemap_write_and_wait_range(mapping,
+                       offset, offset + length - 1);
+
+               if (err)
+                       return err;
+       }
+
+       mutex_lock(&inode->i_mutex);
+       /* It's not possible punch hole on append only file */
+       if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
+               err = -EPERM;
+               goto out_mutex;
+       }
+       if (IS_SWAPFILE(inode)) {
+               err = -ETXTBSY;
+               goto out_mutex;
+       }
+
        /* No need to punch hole beyond i_size */
        if (offset >= inode->i_size)
-               return 0;
+               goto out_mutex;

        /*
         * If the hole extends beyond i_size, set the hole
···
        first_page_offset = first_page << PAGE_CACHE_SHIFT;
        last_page_offset = last_page << PAGE_CACHE_SHIFT;

-       /*
-        * Write out all dirty pages to avoid race conditions
-        * Then release them.
-        */
-       if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-               err = filemap_write_and_wait_range(mapping,
-                       offset, offset + length - 1);
-
-               if (err)
-                       return err;
-       }
-
        /* Now release the pages */
        if (last_page_offset > first_page_offset) {
                truncate_pagecache_range(inode, first_page_offset,
                                         last_page_offset - 1);
        }

-       /* finish any pending end_io work */
-       ext4_flush_completed_IO(inode);
+       /* Wait all existing dio workers, newcomers will block on i_mutex */
+       ext4_inode_block_unlocked_dio(inode);
+       err = ext4_flush_unwritten_io(inode);
+       if (err)
+               goto out_dio;
+       inode_dio_wait(inode);

        credits = ext4_writepage_trans_blocks(inode);
        handle = ext4_journal_start(inode, credits);
-       if (IS_ERR(handle))
-               return PTR_ERR(handle);
+       if (IS_ERR(handle)) {
+               err = PTR_ERR(handle);
+               goto out_dio;
+       }

-       err = ext4_orphan_add(handle, inode);
-       if (err)
-               goto out;

        /*
         * Now we need to zero out the non-page-aligned data in the
···
        up_write(&EXT4_I(inode)->i_data_sem);

 out:
-       ext4_orphan_del(handle, inode);
        inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
        ext4_mark_inode_dirty(handle, inode);
        ext4_journal_stop(handle);
+out_dio:
+       ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+       mutex_unlock(&inode->i_mutex);
        return err;
 }
 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
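The ext4_ext_punch_hole() changes above move the permission checks (append-only, immutable, swapfile) and dirty-page writeback ahead of the hole punch, and make the whole operation wait out in-flight DIO under i_mutex. From user space this path is driven by fallocate(2) with FALLOC_FL_PUNCH_HOLE. A minimal Linux-only sketch exercising it on a scratch file (the temp-file path is illustrative; the punch falls back gracefully where the filesystem lacks support):

```c
#define _GNU_SOURCE
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

/* Punch a hole in a scratch file.  Returns 0 on success, or 0 if the
 * underlying filesystem simply lacks hole-punch support. */
int punch_hole_demo(void)
{
        char path[] = "/tmp/punchXXXXXX";   /* scratch file, illustrative */
        int fd = mkstemp(path);
        char buf[4096];
        struct stat st;

        if (fd < 0)
                return -1;
        unlink(path);                       /* anonymous once closed */

        memset(buf, 'x', sizeof(buf));
        for (int i = 0; i < 16; i++)        /* 64 KiB of data */
                if (write(fd, buf, sizeof(buf)) != (ssize_t)sizeof(buf))
                        return -1;

        /* KEEP_SIZE is mandatory with PUNCH_HOLE: i_size must not change */
        if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                      4 * 4096, 8 * 4096) != 0) {
                if (errno == EOPNOTSUPP || errno == ENOSYS) {
                        close(fd);
                        return 0;           /* no support here; skip */
                }
                return -1;
        }

        /* the file logically still holds 64 KiB; only blocks were freed */
        if (fstat(fd, &st) != 0 || st.st_size != 16 * 4096)
                return -1;
        close(fd);
        return 0;
}
```

Punching into an append-only or immutable file would now fail with EPERM, and into a swapfile with ETXTBSY, exactly per the error paths added in the diff.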
fs/ext4/file.c (+3 -3)
···
        return 0;
 }

-static void ext4_aiodio_wait(struct inode *inode)
+void ext4_unwritten_wait(struct inode *inode)
 {
        wait_queue_head_t *wq = ext4_ioend_wq(inode);

-       wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0));
+       wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
 }

 /*
···
                         "performance will be poor.",
                         inode->i_ino, current->comm);
                mutex_lock(ext4_aio_mutex(inode));
-               ext4_aiodio_wait(inode);
+               ext4_unwritten_wait(inode);
        }

        BUG_ON(iocb->ki_pos != pos);
fs/ext4/fsync.c (+7 -85)
···
 #include <trace/events/ext4.h>

-static void dump_completed_IO(struct inode * inode)
-{
-#ifdef	EXT4FS_DEBUG
-       struct list_head *cur, *before, *after;
-       ext4_io_end_t *io, *io0, *io1;
-       unsigned long flags;
-
-       if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
-               ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
-               return;
-       }
-
-       ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
-       spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
-       list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
-               cur = &io->list;
-               before = cur->prev;
-               io0 = container_of(before, ext4_io_end_t, list);
-               after = cur->next;
-               io1 = container_of(after, ext4_io_end_t, list);
-
-               ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
-                           io, inode->i_ino, io0, io1);
-       }
-       spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-#endif
-}
-
-/*
- * This function is called from ext4_sync_file().
- *
- * When IO is completed, the work to convert unwritten extents to
- * written is queued on workqueue but may not get immediately
- * scheduled. When fsync is called, we need to ensure the
- * conversion is complete before fsync returns.
- * The inode keeps track of a list of pending/completed IO that
- * might needs to do the conversion. This function walks through
- * the list and convert the related unwritten extents for completed IO
- * to written.
- * The function return the number of pending IOs on success.
- */
-int ext4_flush_completed_IO(struct inode *inode)
-{
-       ext4_io_end_t *io;
-       struct ext4_inode_info *ei = EXT4_I(inode);
-       unsigned long flags;
-       int ret = 0;
-       int ret2 = 0;
-
-       dump_completed_IO(inode);
-       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-       while (!list_empty(&ei->i_completed_io_list)){
-               io = list_entry(ei->i_completed_io_list.next,
-                               ext4_io_end_t, list);
-               list_del_init(&io->list);
-               io->flag |= EXT4_IO_END_IN_FSYNC;
-               /*
-                * Calling ext4_end_io_nolock() to convert completed
-                * IO to written.
-                *
-                * When ext4_sync_file() is called, run_queue() may already
-                * about to flush the work corresponding to this io structure.
-                * It will be upset if it founds the io structure related
-                * to the work-to-be schedule is freed.
-                *
-                * Thus we need to keep the io structure still valid here after
-                * conversion finished. The io structure has a flag to
-                * avoid double converting from both fsync and background work
-                * queue work.
-                */
-               spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-               ret = ext4_end_io_nolock(io);
-               if (ret < 0)
-                       ret2 = ret;
-               spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-               io->flag &= ~EXT4_IO_END_IN_FSYNC;
-       }
-       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-       return (ret2 < 0) ? ret2 : 0;
-}
-
 /*
  * If we're not journaling and this is a just-created file, we have to
  * sync our parent directory (if it was freshly created) since
···
        struct inode *inode = file->f_mapping->host;
        struct ext4_inode_info *ei = EXT4_I(inode);
        journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
-       int ret;
+       int ret, err;
        tid_t commit_tid;
        bool needs_barrier = false;
···
        if (inode->i_sb->s_flags & MS_RDONLY)
                goto out;

-       ret = ext4_flush_completed_IO(inode);
+       ret = ext4_flush_unwritten_io(inode);
        if (ret < 0)
                goto out;
···
                needs_barrier = true;
        jbd2_log_start_commit(journal, commit_tid);
        ret = jbd2_log_wait_commit(journal, commit_tid);
-       if (needs_barrier)
-               blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+       if (needs_barrier) {
+               err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+               if (!ret)
+                       ret = err;
+       }
 out:
        mutex_unlock(&inode->i_mutex);
        trace_ext4_sync_file_exit(inode, ret);
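The last hunk above makes ext4_sync_file() propagate a failed blkdev_issue_flush() back to the caller instead of silently dropping it, which means fsync(2) can now report a cache-flush failure to user space. That only helps applications that check the return value; a small user-space sketch of the check (scratch-file path is illustrative):

```c
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Write a small buffer and make it durable, reporting any flush error. */
int write_durable_demo(void)
{
        char path[] = "/tmp/fsyncXXXXXX";   /* scratch file, illustrative */
        int fd = mkstemp(path);
        const char msg[] = "hello";

        if (fd < 0)
                return -1;
        unlink(path);
        if (write(fd, msg, strlen(msg)) != (ssize_t)strlen(msg))
                return -1;
        /* fsync() is only a durability guarantee if its result is checked:
         * a failed journal commit or device cache flush surfaces here. */
        if (fsync(fd) != 0)
                return -1;
        return close(fd);
}
```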
+9
fs/ext4/ialloc.c
··· 697 697 if (!gdp) 698 698 goto fail; 699 699 700 + /* 701 + * Check free inodes count before loading bitmap. 702 + */ 703 + if (ext4_free_inodes_count(sb, gdp) == 0) { 704 + if (++group == ngroups) 705 + group = 0; 706 + continue; 707 + } 708 + 700 709 brelse(inode_bitmap_bh); 701 710 inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); 702 711 if (!inode_bitmap_bh)
+16 -2
fs/ext4/indirect.c
··· 807 807 808 808 retry: 809 809 if (rw == READ && ext4_should_dioread_nolock(inode)) { 810 - if (unlikely(!list_empty(&ei->i_completed_io_list))) { 810 + if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) { 811 811 mutex_lock(&inode->i_mutex); 812 - ext4_flush_completed_IO(inode); 812 + ext4_flush_unwritten_io(inode); 813 813 mutex_unlock(&inode->i_mutex); 814 + } 815 + /* 816 + * Nolock dioread optimization may be dynamically disabled 817 + * via ext4_inode_block_unlocked_dio(). Check inode's state 818 + * while holding extra i_dio_count ref. 819 + */ 820 + atomic_inc(&inode->i_dio_count); 821 + smp_mb(); 822 + if (unlikely(ext4_test_inode_state(inode, 823 + EXT4_STATE_DIOREAD_LOCK))) { 824 + inode_dio_done(inode); 825 + goto locked; 814 826 } 815 827 ret = __blockdev_direct_IO(rw, iocb, inode, 816 828 inode->i_sb->s_bdev, iov, 817 829 offset, nr_segs, 818 830 ext4_get_block, NULL, NULL, 0); 831 + inode_dio_done(inode); 819 832 } else { 833 + locked: 820 834 ret = blockdev_direct_IO(rw, iocb, inode, iov, 821 835 offset, nr_segs, ext4_get_block); 822 836
+42 -41
fs/ext4/inode.c
··· 732 732 err = ext4_map_blocks(handle, inode, &map, 733 733 create ? EXT4_GET_BLOCKS_CREATE : 0); 734 734 735 + /* ensure we send some value back into *errp */ 736 + *errp = 0; 737 + 735 738 if (err < 0) 736 739 *errp = err; 737 740 if (err <= 0) 738 741 return NULL; 739 - *errp = 0; 740 742 741 743 bh = sb_getblk(inode->i_sb, map.m_pblk); 742 744 if (!bh) { ··· 1956 1954 return ret; 1957 1955 } 1958 1956 1959 - static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); 1960 - static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); 1961 - 1962 1957 /* 1963 1958 * Note that we don't need to start a transaction unless we're journaling data 1964 1959 * because we should have holes filled from ext4_page_mkwrite(). We even don't ··· 2462 2463 free_blocks = EXT4_C2B(sbi, 2463 2464 percpu_counter_read_positive(&sbi->s_freeclusters_counter)); 2464 2465 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); 2466 + /* 2467 + * Start pushing delalloc when 1/2 of free blocks are dirty. 2468 + */ 2469 + if (dirty_blocks && (free_blocks < 2 * dirty_blocks) && 2470 + !writeback_in_progress(sb->s_bdi) && 2471 + down_read_trylock(&sb->s_umount)) { 2472 + writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); 2473 + up_read(&sb->s_umount); 2474 + } 2475 + 2465 2476 if (2 * free_blocks < 3 * dirty_blocks || 2466 2477 free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { 2467 2478 /* ··· 2480 2471 */ 2481 2472 return 1; 2482 2473 } 2483 - /* 2484 - * Even if we don't switch but are nearing capacity, 2485 - * start pushing delalloc when 1/2 of free blocks are dirty. 
2486 - */ 2487 - if (free_blocks < 2 * dirty_blocks) 2488 - writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE); 2489 - 2490 2474 return 0; 2491 2475 } 2492 2476 ··· 2881 2879 { 2882 2880 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 2883 2881 ext4_io_end_t *io_end = iocb->private; 2884 - struct workqueue_struct *wq; 2885 - unsigned long flags; 2886 - struct ext4_inode_info *ei; 2887 2882 2888 2883 /* if not async direct IO or dio with 0 bytes write, just return */ 2889 2884 if (!io_end || !size) ··· 2909 2910 io_end->iocb = iocb; 2910 2911 io_end->result = ret; 2911 2912 } 2912 - wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 2913 2913 2914 - /* Add the io_end to per-inode completed aio dio list*/ 2915 - ei = EXT4_I(io_end->inode); 2916 - spin_lock_irqsave(&ei->i_completed_io_lock, flags); 2917 - list_add_tail(&io_end->list, &ei->i_completed_io_list); 2918 - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 2919 - 2920 - /* queue the work to convert unwritten extents to written */ 2921 - queue_work(wq, &io_end->work); 2914 + ext4_add_complete_io(io_end); 2922 2915 } 2923 2916 2924 2917 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) 2925 2918 { 2926 2919 ext4_io_end_t *io_end = bh->b_private; 2927 - struct workqueue_struct *wq; 2928 2920 struct inode *inode; 2929 - unsigned long flags; 2930 2921 2931 2922 if (!test_clear_buffer_uninit(bh) || !io_end) 2932 2923 goto out; ··· 2935 2946 */ 2936 2947 inode = io_end->inode; 2937 2948 ext4_set_io_unwritten_flag(inode, io_end); 2938 - 2939 - /* Add the io_end to per-inode completed io list*/ 2940 - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); 2941 - list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); 2942 - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); 2943 - 2944 - wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; 2945 - /* queue the work to convert unwritten extents to written */ 2946 - queue_work(wq, 
&io_end->work); 2949 + ext4_add_complete_io(io_end); 2947 2950 out: 2948 2951 bh->b_private = NULL; 2949 2952 bh->b_end_io = NULL; ··· 3010 3029 overwrite = *((int *)iocb->private); 3011 3030 3012 3031 if (overwrite) { 3032 + atomic_inc(&inode->i_dio_count); 3013 3033 down_read(&EXT4_I(inode)->i_data_sem); 3014 3034 mutex_unlock(&inode->i_mutex); 3015 3035 } ··· 3036 3054 * hook to the iocb. 3037 3055 */ 3038 3056 iocb->private = NULL; 3039 - EXT4_I(inode)->cur_aio_dio = NULL; 3057 + ext4_inode_aio_set(inode, NULL); 3040 3058 if (!is_sync_kiocb(iocb)) { 3041 3059 ext4_io_end_t *io_end = 3042 3060 ext4_init_io_end(inode, GFP_NOFS); ··· 3053 3071 * is a unwritten extents needs to be converted 3054 3072 * when IO is completed. 3055 3073 */ 3056 - EXT4_I(inode)->cur_aio_dio = iocb->private; 3074 + ext4_inode_aio_set(inode, io_end); 3057 3075 } 3058 3076 3059 3077 if (overwrite) ··· 3073 3091 NULL, 3074 3092 DIO_LOCKING); 3075 3093 if (iocb->private) 3076 - EXT4_I(inode)->cur_aio_dio = NULL; 3094 + ext4_inode_aio_set(inode, NULL); 3077 3095 /* 3078 3096 * The io_end structure takes a reference to the inode, 3079 3097 * that structure needs to be destroyed and the ··· 3108 3126 retake_lock: 3109 3127 /* take i_mutex locking again if we do a ovewrite dio */ 3110 3128 if (overwrite) { 3129 + inode_dio_done(inode); 3111 3130 up_read(&EXT4_I(inode)->i_data_sem); 3112 3131 mutex_lock(&inode->i_mutex); 3113 3132 } ··· 4035 4052 struct ext4_inode_info *ei = EXT4_I(inode); 4036 4053 struct buffer_head *bh = iloc->bh; 4037 4054 int err = 0, rc, block; 4055 + int need_datasync = 0; 4038 4056 uid_t i_uid; 4039 4057 gid_t i_gid; 4040 4058 ··· 4086 4102 raw_inode->i_file_acl_high = 4087 4103 cpu_to_le16(ei->i_file_acl >> 32); 4088 4104 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); 4089 - ext4_isize_set(raw_inode, ei->i_disksize); 4105 + if (ei->i_disksize != ext4_isize(raw_inode)) { 4106 + ext4_isize_set(raw_inode, ei->i_disksize); 4107 + need_datasync = 1; 4108 + } 4090 
4109 if (ei->i_disksize > 0x7fffffffULL) { 4091 4110 struct super_block *sb = inode->i_sb; 4092 4111 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, ··· 4142 4155 err = rc; 4143 4156 ext4_clear_inode_state(inode, EXT4_STATE_NEW); 4144 4157 4145 - ext4_update_inode_fsync_trans(handle, inode, 0); 4158 + ext4_update_inode_fsync_trans(handle, inode, need_datasync); 4146 4159 out_brelse: 4147 4160 brelse(bh); 4148 4161 ext4_std_error(inode->i_sb, err); ··· 4285 4298 } 4286 4299 4287 4300 if (attr->ia_valid & ATTR_SIZE) { 4288 - inode_dio_wait(inode); 4289 4301 4290 4302 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 4291 4303 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ··· 4333 4347 } 4334 4348 4335 4349 if (attr->ia_valid & ATTR_SIZE) { 4336 - if (attr->ia_size != i_size_read(inode)) 4350 + if (attr->ia_size != i_size_read(inode)) { 4337 4351 truncate_setsize(inode, attr->ia_size); 4352 + /* Inode size will be reduced, wait for dio in flight. 4353 + * Temporarily disable dioread_nolock to prevent 4354 + * livelock. */ 4355 + if (orphan) { 4356 + ext4_inode_block_unlocked_dio(inode); 4357 + inode_dio_wait(inode); 4358 + ext4_inode_resume_unlocked_dio(inode); 4359 + } 4360 + } 4338 4361 ext4_truncate(inode); 4339 4362 } 4340 4363 ··· 4722 4727 return err; 4723 4728 } 4724 4729 4730 + /* Wait for all existing dio workers */ 4731 + ext4_inode_block_unlocked_dio(inode); 4732 + inode_dio_wait(inode); 4733 + 4725 4734 jbd2_journal_lock_updates(journal); 4726 4735 4727 4736 /* ··· 4745 4746 ext4_set_aops(inode); 4746 4747 4747 4748 jbd2_journal_unlock_updates(journal); 4749 + ext4_inode_resume_unlocked_dio(inode); 4748 4750 4749 4751 /* Finally we can mark the inode as dirty. */ 4750 4752 ··· 4780 4780 int retries = 0; 4781 4781 4782 4782 sb_start_pagefault(inode->i_sb); 4783 + file_update_time(vma->vm_file); 4783 4784 /* Delalloc case is easy... */ 4784 4785 if (test_opt(inode->i_sb, DELALLOC) && 4785 4786 !ext4_should_journal_data(inode) &&
-22
fs/ext4/ioctl.c
··· 366 366 return -EOPNOTSUPP; 367 367 } 368 368 369 - if (EXT4_HAS_INCOMPAT_FEATURE(sb, 370 - EXT4_FEATURE_INCOMPAT_META_BG)) { 371 - ext4_msg(sb, KERN_ERR, 372 - "Online resizing not (yet) supported with meta_bg"); 373 - return -EOPNOTSUPP; 374 - } 375 - 376 369 if (copy_from_user(&n_blocks_count, (__u64 __user *)arg, 377 370 sizeof(__u64))) { 378 371 return -EFAULT; 379 - } 380 - 381 - if (n_blocks_count > MAX_32_NUM && 382 - !EXT4_HAS_INCOMPAT_FEATURE(sb, 383 - EXT4_FEATURE_INCOMPAT_64BIT)) { 384 - ext4_msg(sb, KERN_ERR, 385 - "File system only supports 32-bit block numbers"); 386 - return -EOPNOTSUPP; 387 372 } 388 373 389 374 err = ext4_resize_begin(sb); ··· 404 419 405 420 if (!blk_queue_discard(q)) 406 421 return -EOPNOTSUPP; 407 - 408 - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 409 - EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { 410 - ext4_msg(sb, KERN_ERR, 411 - "FITRIM not supported with bigalloc"); 412 - return -EOPNOTSUPP; 413 - } 414 422 415 423 if (copy_from_user(&range, (struct fstrim_range __user *)arg, 416 424 sizeof(range)))
+66 -63
fs/ext4/mballoc.c
··· 24 24 #include "ext4_jbd2.h" 25 25 #include "mballoc.h" 26 26 #include <linux/debugfs.h> 27 + #include <linux/log2.h> 27 28 #include <linux/slab.h> 28 29 #include <trace/events/ext4.h> 29 30 ··· 1339 1338 mb_check_buddy(e4b); 1340 1339 } 1341 1340 1342 - static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, 1341 + static int mb_find_extent(struct ext4_buddy *e4b, int block, 1343 1342 int needed, struct ext4_free_extent *ex) 1344 1343 { 1345 1344 int next = block; 1346 - int max; 1345 + int max, order; 1347 1346 void *buddy; 1348 1347 1349 1348 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 1350 1349 BUG_ON(ex == NULL); 1351 1350 1352 - buddy = mb_find_buddy(e4b, order, &max); 1351 + buddy = mb_find_buddy(e4b, 0, &max); 1353 1352 BUG_ON(buddy == NULL); 1354 1353 BUG_ON(block >= max); 1355 1354 if (mb_test_bit(block, buddy)) { ··· 1359 1358 return 0; 1360 1359 } 1361 1360 1362 - /* FIXME dorp order completely ? */ 1363 - if (likely(order == 0)) { 1364 - /* find actual order */ 1365 - order = mb_find_order_for_block(e4b, block); 1366 - block = block >> order; 1367 - } 1361 + /* find actual order */ 1362 + order = mb_find_order_for_block(e4b, block); 1363 + block = block >> order; 1368 1364 1369 1365 ex->fe_len = 1 << order; 1370 1366 ex->fe_start = block << order; ··· 1547 1549 /* recheck chunk's availability - we don't know 1548 1550 * when it was found (within this lock-unlock 1549 1551 * period or not) */ 1550 - max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex); 1552 + max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex); 1551 1553 if (max >= gex->fe_len) { 1552 1554 ext4_mb_use_best_found(ac, e4b); 1553 1555 return; ··· 1639 1641 return err; 1640 1642 1641 1643 ext4_lock_group(ac->ac_sb, group); 1642 - max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex); 1644 + max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); 1643 1645 1644 1646 if (max > 0) { 1645 1647 ac->ac_b_ex = ex; ··· 1660 1662 int 
max; 1661 1663 int err; 1662 1664 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1665 + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 1663 1666 struct ext4_free_extent ex; 1664 1667 1665 1668 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) 1669 + return 0; 1670 + if (grp->bb_free == 0) 1666 1671 return 0; 1667 1672 1668 1673 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); ··· 1673 1672 return err; 1674 1673 1675 1674 ext4_lock_group(ac->ac_sb, group); 1676 - max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start, 1675 + max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, 1677 1676 ac->ac_g_ex.fe_len, &ex); 1678 1677 1679 1678 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ··· 1789 1788 break; 1790 1789 } 1791 1790 1792 - mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); 1791 + mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex); 1793 1792 BUG_ON(ex.fe_len <= 0); 1794 1793 if (free < ex.fe_len) { 1795 1794 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, ··· 1841 1840 1842 1841 while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { 1843 1842 if (!mb_test_bit(i, bitmap)) { 1844 - max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); 1843 + max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); 1845 1844 if (max >= sbi->s_stripe) { 1846 1845 ac->ac_found++; 1847 1846 ac->ac_b_ex = ex; ··· 1863 1862 1864 1863 BUG_ON(cr < 0 || cr >= 4); 1865 1864 1865 + free = grp->bb_free; 1866 + if (free == 0) 1867 + return 0; 1868 + if (cr <= 2 && free < ac->ac_g_ex.fe_len) 1869 + return 0; 1870 + 1866 1871 /* We only do this if the grp has never been initialized */ 1867 1872 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 1868 1873 int ret = ext4_mb_init_group(ac->ac_sb, group); ··· 1876 1869 return 0; 1877 1870 } 1878 1871 1879 - free = grp->bb_free; 1880 1872 fragments = grp->bb_fragments; 1881 - if (free == 0) 1882 - return 0; 1883 1873 if (fragments == 0) 1884 1874 return 0; 1885 1875 ··· 2167 2163 return cachep; 2168 2164 } 2169 2165 2166 + /* 2167 + * Allocate the 
top-level s_group_info array for the specified number 2168 + * of groups 2169 + */ 2170 + int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) 2171 + { 2172 + struct ext4_sb_info *sbi = EXT4_SB(sb); 2173 + unsigned size; 2174 + struct ext4_group_info ***new_groupinfo; 2175 + 2176 + size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> 2177 + EXT4_DESC_PER_BLOCK_BITS(sb); 2178 + if (size <= sbi->s_group_info_size) 2179 + return 0; 2180 + 2181 + size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); 2182 + new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL); 2183 + if (!new_groupinfo) { 2184 + ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); 2185 + return -ENOMEM; 2186 + } 2187 + if (sbi->s_group_info) { 2188 + memcpy(new_groupinfo, sbi->s_group_info, 2189 + sbi->s_group_info_size * sizeof(*sbi->s_group_info)); 2190 + ext4_kvfree(sbi->s_group_info); 2191 + } 2192 + sbi->s_group_info = new_groupinfo; 2193 + sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); 2194 + ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", 2195 + sbi->s_group_info_size); 2196 + return 0; 2197 + } 2198 + 2170 2199 /* Create and initialize ext4_group_info data for the given group. 
*/ 2171 2200 int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 2172 2201 struct ext4_group_desc *desc) ··· 2232 2195 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; 2233 2196 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); 2234 2197 2235 - meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); 2198 + meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL); 2236 2199 if (meta_group_info[i] == NULL) { 2237 2200 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); 2238 2201 goto exit_group_info; 2239 2202 } 2240 - memset(meta_group_info[i], 0, kmem_cache_size(cachep)); 2241 2203 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, 2242 2204 &(meta_group_info[i]->bb_state)); 2243 2205 ··· 2288 2252 ext4_group_t ngroups = ext4_get_groups_count(sb); 2289 2253 ext4_group_t i; 2290 2254 struct ext4_sb_info *sbi = EXT4_SB(sb); 2291 - struct ext4_super_block *es = sbi->s_es; 2292 - int num_meta_group_infos; 2293 - int num_meta_group_infos_max; 2294 - int array_size; 2255 + int err; 2295 2256 struct ext4_group_desc *desc; 2296 2257 struct kmem_cache *cachep; 2297 2258 2298 - /* This is the number of blocks used by GDT */ 2299 - num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 2300 - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); 2259 + err = ext4_mb_alloc_groupinfo(sb, ngroups); 2260 + if (err) 2261 + return err; 2301 2262 2302 - /* 2303 - * This is the total number of blocks used by GDT including 2304 - * the number of reserved blocks for GDT. 2305 - * The s_group_info array is allocated with this value 2306 - * to allow a clean online resize without a complex 2307 - * manipulation of pointer. 
2308 - * The drawback is the unused memory when no resize 2309 - * occurs but it's very low in terms of pages 2310 - * (see comments below) 2311 - * Need to handle this properly when META_BG resizing is allowed 2312 - */ 2313 - num_meta_group_infos_max = num_meta_group_infos + 2314 - le16_to_cpu(es->s_reserved_gdt_blocks); 2315 - 2316 - /* 2317 - * array_size is the size of s_group_info array. We round it 2318 - * to the next power of two because this approximation is done 2319 - * internally by kmalloc so we can have some more memory 2320 - * for free here (e.g. may be used for META_BG resize). 2321 - */ 2322 - array_size = 1; 2323 - while (array_size < sizeof(*sbi->s_group_info) * 2324 - num_meta_group_infos_max) 2325 - array_size = array_size << 1; 2326 - /* An 8TB filesystem with 64-bit pointers requires a 4096 byte 2327 - * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. 2328 - * So a two level scheme suffices for now. */ 2329 - sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL); 2330 - if (sbi->s_group_info == NULL) { 2331 - ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); 2332 - return -ENOMEM; 2333 - } 2334 2263 sbi->s_buddy_cache = new_inode(sb); 2335 2264 if (sbi->s_buddy_cache == NULL) { 2336 2265 ext4_msg(sb, KERN_ERR, "can't get new inode"); ··· 2323 2322 cachep = get_groupinfo_cache(sb->s_blocksize_bits); 2324 2323 while (i-- > 0) 2325 2324 kmem_cache_free(cachep, ext4_get_group_info(sb, i)); 2326 - i = num_meta_group_infos; 2325 + i = sbi->s_group_info_size; 2327 2326 while (i-- > 0) 2328 2327 kfree(sbi->s_group_info[i]); 2329 2328 iput(sbi->s_buddy_cache); ··· 4009 4008 ext4_get_group_no_and_offset(sb, goal, &group, &block); 4010 4009 4011 4010 /* set up allocation goals */ 4012 - memset(ac, 0, sizeof(struct ext4_allocation_context)); 4013 4011 ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1); 4014 4012 ac->ac_status = AC_STATUS_CONTINUE; 4015 4013 ac->ac_sb = sb; ··· 4291 4291 } 4292 4292 } 4293 
4293 4294 - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4294 + ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS); 4295 4295 if (!ac) { 4296 4296 ar->len = 0; 4297 4297 *errp = -ENOMEM; ··· 4657 4657 * with group lock held. generate_buddy look at 4658 4658 * them with group lock_held 4659 4659 */ 4660 + if (test_opt(sb, DISCARD)) 4661 + ext4_issue_discard(sb, block_group, bit, count); 4660 4662 ext4_lock_group(sb, block_group); 4661 4663 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); 4662 4664 mb_free_blocks(inode, &e4b, bit, count_clusters); ··· 4990 4988 4991 4989 start = range->start >> sb->s_blocksize_bits; 4992 4990 end = start + (range->len >> sb->s_blocksize_bits) - 1; 4993 - minlen = range->minlen >> sb->s_blocksize_bits; 4991 + minlen = EXT4_NUM_B2C(EXT4_SB(sb), 4992 + range->minlen >> sb->s_blocksize_bits); 4994 4993 4995 4994 if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) || 4996 4995 unlikely(start >= max_blks)) ··· 5051 5048 atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); 5052 5049 5053 5050 out: 5054 - range->len = trimmed * sb->s_blocksize; 5051 + range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; 5055 5052 return ret; 5056 5053 }
-5
fs/ext4/mballoc.h
··· 65 65 #define MB_DEFAULT_MIN_TO_SCAN 10 66 66 67 67 /* 68 - * How many groups mballoc will scan looking for the best chunk 69 - */ 70 - #define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5 71 - 72 - /* 73 68 * with 'ext4_mb_stats' allocator will collect stats that will be 74 69 * shown at umount. The collecting costs though! 75 70 */
+315 -203
fs/ext4/move_extent.c
··· 141 141 } 142 142 143 143 /** 144 - * mext_check_null_inode - NULL check for two inodes 145 - * 146 - * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. 147 - */ 148 - static int 149 - mext_check_null_inode(struct inode *inode1, struct inode *inode2, 150 - const char *function, unsigned int line) 151 - { 152 - int ret = 0; 153 - 154 - if (inode1 == NULL) { 155 - __ext4_error(inode2->i_sb, function, line, 156 - "Both inodes should not be NULL: " 157 - "inode1 NULL inode2 %lu", inode2->i_ino); 158 - ret = -EIO; 159 - } else if (inode2 == NULL) { 160 - __ext4_error(inode1->i_sb, function, line, 161 - "Both inodes should not be NULL: " 162 - "inode1 %lu inode2 NULL", inode1->i_ino); 163 - ret = -EIO; 164 - } 165 - return ret; 166 - } 167 - 168 - /** 169 144 * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem 170 145 * 171 - * @orig_inode: original inode structure 172 - * @donor_inode: donor inode structure 173 - * Acquire write lock of i_data_sem of the two inodes (orig and donor) by 174 - * i_ino order. 146 + * Acquire write lock of i_data_sem of the two inodes 175 147 */ 176 148 static void 177 - double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) 149 + double_down_write_data_sem(struct inode *first, struct inode *second) 178 150 { 179 - struct inode *first = orig_inode, *second = donor_inode; 151 + if (first < second) { 152 + down_write(&EXT4_I(first)->i_data_sem); 153 + down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); 154 + } else { 155 + down_write(&EXT4_I(second)->i_data_sem); 156 + down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING); 180 157 181 - /* 182 - * Use the inode number to provide the stable locking order instead 183 - * of its address, because the C language doesn't guarantee you can 184 - * compare pointers that don't come from the same array. 
185 - */ 186 - if (donor_inode->i_ino < orig_inode->i_ino) { 187 - first = donor_inode; 188 - second = orig_inode; 189 158 } 190 - 191 - down_write(&EXT4_I(first)->i_data_sem); 192 - down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); 193 159 } 194 160 195 161 /** ··· 570 604 diff = donor_off - le32_to_cpu(tmp_dext->ee_block); 571 605 572 606 ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff); 573 - tmp_dext->ee_block = 574 - cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff); 575 - tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff); 607 + le32_add_cpu(&tmp_dext->ee_block, diff); 608 + le16_add_cpu(&tmp_dext->ee_len, -diff); 576 609 577 610 if (max_count < ext4_ext_get_actual_len(tmp_dext)) 578 611 tmp_dext->ee_len = cpu_to_le16(max_count); ··· 591 626 copy_extent_status(&dext_old, tmp_oext); 592 627 593 628 return 0; 629 + } 630 + 631 + /** 632 + * mext_check_coverage - Check that all extents in range has the same type 633 + * 634 + * @inode: inode in question 635 + * @from: block offset of inode 636 + * @count: block count to be checked 637 + * @uninit: extents expected to be uninitialized 638 + * @err: pointer to save error value 639 + * 640 + * Return 1 if all extents in range has expected type, and zero otherwise. 
641 + */ 642 + static int 643 + mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, 644 + int uninit, int *err) 645 + { 646 + struct ext4_ext_path *path = NULL; 647 + struct ext4_extent *ext; 648 + ext4_lblk_t last = from + count; 649 + while (from < last) { 650 + *err = get_ext_path(inode, from, &path); 651 + if (*err) 652 + return 0; 653 + ext = path[ext_depth(inode)].p_ext; 654 + if (!ext) { 655 + ext4_ext_drop_refs(path); 656 + return 0; 657 + } 658 + if (uninit != ext4_ext_is_uninitialized(ext)) { 659 + ext4_ext_drop_refs(path); 660 + return 0; 661 + } 662 + from += ext4_ext_get_actual_len(ext); 663 + ext4_ext_drop_refs(path); 664 + } 665 + return 1; 594 666 } 595 667 596 668 /** ··· 664 662 int depth; 665 663 int replaced_count = 0; 666 664 int dext_alen; 667 - 668 - /* Protect extent trees against block allocations via delalloc */ 669 - double_down_write_data_sem(orig_inode, donor_inode); 670 665 671 666 /* Get the original extent for the block "orig_off" */ 672 667 *err = get_ext_path(orig_inode, orig_off, &orig_path); ··· 763 764 ext4_ext_invalidate_cache(orig_inode); 764 765 ext4_ext_invalidate_cache(donor_inode); 765 766 766 - double_up_write_data_sem(orig_inode, donor_inode); 767 - 768 767 return replaced_count; 768 + } 769 + 770 + /** 771 + * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2 772 + * 773 + * @inode1: the inode structure 774 + * @inode2: the inode structure 775 + * @index: page index 776 + * @page: result page vector 777 + * 778 + * Grab two locked pages for inode's by inode order 779 + */ 780 + static int 781 + mext_page_double_lock(struct inode *inode1, struct inode *inode2, 782 + pgoff_t index, struct page *page[2]) 783 + { 784 + struct address_space *mapping[2]; 785 + unsigned fl = AOP_FLAG_NOFS; 786 + 787 + BUG_ON(!inode1 || !inode2); 788 + if (inode1 < inode2) { 789 + mapping[0] = inode1->i_mapping; 790 + mapping[1] = inode2->i_mapping; 791 + } else { 792 + mapping[0] = 
inode2->i_mapping; 793 + mapping[1] = inode1->i_mapping; 794 + } 795 + 796 + page[0] = grab_cache_page_write_begin(mapping[0], index, fl); 797 + if (!page[0]) 798 + return -ENOMEM; 799 + 800 + page[1] = grab_cache_page_write_begin(mapping[1], index, fl); 801 + if (!page[1]) { 802 + unlock_page(page[0]); 803 + page_cache_release(page[0]); 804 + return -ENOMEM; 805 + } 806 + 807 + if (inode1 > inode2) { 808 + struct page *tmp; 809 + tmp = page[0]; 810 + page[0] = page[1]; 811 + page[1] = tmp; 812 + } 813 + return 0; 814 + } 815 + 816 + /* Force page buffers uptodate w/o dropping page's lock */ 817 + static int 818 + mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) 819 + { 820 + struct inode *inode = page->mapping->host; 821 + sector_t block; 822 + struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; 823 + unsigned int blocksize, block_start, block_end; 824 + int i, err, nr = 0, partial = 0; 825 + BUG_ON(!PageLocked(page)); 826 + BUG_ON(PageWriteback(page)); 827 + 828 + if (PageUptodate(page)) 829 + return 0; 830 + 831 + blocksize = 1 << inode->i_blkbits; 832 + if (!page_has_buffers(page)) 833 + create_empty_buffers(page, blocksize, 0); 834 + 835 + head = page_buffers(page); 836 + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 837 + for (bh = head, block_start = 0; bh != head || !block_start; 838 + block++, block_start = block_end, bh = bh->b_this_page) { 839 + block_end = block_start + blocksize; 840 + if (block_end <= from || block_start >= to) { 841 + if (!buffer_uptodate(bh)) 842 + partial = 1; 843 + continue; 844 + } 845 + if (buffer_uptodate(bh)) 846 + continue; 847 + if (!buffer_mapped(bh)) { 848 + int err = 0; 849 + err = ext4_get_block(inode, block, bh, 0); 850 + if (err) { 851 + SetPageError(page); 852 + return err; 853 + } 854 + if (!buffer_mapped(bh)) { 855 + zero_user(page, block_start, blocksize); 856 + if (!err) 857 + set_buffer_uptodate(bh); 858 + continue; 859 + } 860 + } 861 + BUG_ON(nr >= 
MAX_BUF_PER_PAGE); 862 + arr[nr++] = bh; 863 + } 864 + /* No io required */ 865 + if (!nr) 866 + goto out; 867 + 868 + for (i = 0; i < nr; i++) { 869 + bh = arr[i]; 870 + if (!bh_uptodate_or_lock(bh)) { 871 + err = bh_submit_read(bh); 872 + if (err) 873 + return err; 874 + } 875 + } 876 + out: 877 + if (!partial) 878 + SetPageUptodate(page); 879 + return 0; 769 880 } 770 881 771 882 /** ··· 900 791 int block_len_in_page, int uninit, int *err) 901 792 { 902 793 struct inode *orig_inode = o_filp->f_dentry->d_inode; 903 - struct address_space *mapping = orig_inode->i_mapping; 904 - struct buffer_head *bh; 905 - struct page *page = NULL; 906 - const struct address_space_operations *a_ops = mapping->a_ops; 794 + struct page *pagep[2] = {NULL, NULL}; 907 795 handle_t *handle; 908 796 ext4_lblk_t orig_blk_offset; 909 797 long long offs = orig_page_offset << PAGE_CACHE_SHIFT; 910 798 unsigned long blocksize = orig_inode->i_sb->s_blocksize; 911 799 unsigned int w_flags = 0; 912 800 unsigned int tmp_data_size, data_size, replaced_size; 913 - void *fsdata; 914 - int i, jblocks; 915 - int err2 = 0; 801 + int err2, jblocks, retries = 0; 916 802 int replaced_count = 0; 803 + int from = data_offset_in_page << orig_inode->i_blkbits; 917 804 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; 918 805 919 806 /* 920 807 * It needs twice the amount of ordinary journal buffers because 921 808 * inode and donor_inode may change each different metadata blocks. 922 809 */ 810 + again: 811 + *err = 0; 923 812 jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; 924 813 handle = ext4_journal_start(orig_inode, jblocks); 925 814 if (IS_ERR(handle)) { ··· 930 823 931 824 orig_blk_offset = orig_page_offset * blocks_per_page + 932 825 data_offset_in_page; 933 - 934 - /* 935 - * If orig extent is uninitialized one, 936 - * it's not necessary force the page into memory 937 - * and then force it to be written out again. 938 - * Just swap data blocks between orig and donor. 
939 - */ 940 - if (uninit) { 941 - replaced_count = mext_replace_branches(handle, orig_inode, 942 - donor_inode, orig_blk_offset, 943 - block_len_in_page, err); 944 - goto out2; 945 - } 946 826 947 827 offs = (long long)orig_blk_offset << orig_inode->i_blkbits; 948 828 ··· 952 858 953 859 replaced_size = data_size; 954 860 955 - *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags, 956 - &page, &fsdata); 861 + *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset, 862 + pagep); 957 863 if (unlikely(*err < 0)) 958 - goto out; 864 + goto stop_journal; 865 + /* 866 + * If orig extent was uninitialized it can become initialized 867 + * at any time after i_data_sem was dropped, in order to 868 + * serialize with delalloc we have recheck extent while we 869 + * hold page's lock, if it is still the case data copy is not 870 + * necessary, just swap data blocks between orig and donor. 871 + */ 872 + if (uninit) { 873 + double_down_write_data_sem(orig_inode, donor_inode); 874 + /* If any of extents in range became initialized we have to 875 + * fallback to data copying */ 876 + uninit = mext_check_coverage(orig_inode, orig_blk_offset, 877 + block_len_in_page, 1, err); 878 + if (*err) 879 + goto drop_data_sem; 959 880 960 - if (!PageUptodate(page)) { 961 - mapping->a_ops->readpage(o_filp, page); 962 - lock_page(page); 881 + uninit &= mext_check_coverage(donor_inode, orig_blk_offset, 882 + block_len_in_page, 1, err); 883 + if (*err) 884 + goto drop_data_sem; 885 + 886 + if (!uninit) { 887 + double_up_write_data_sem(orig_inode, donor_inode); 888 + goto data_copy; 889 + } 890 + if ((page_has_private(pagep[0]) && 891 + !try_to_release_page(pagep[0], 0)) || 892 + (page_has_private(pagep[1]) && 893 + !try_to_release_page(pagep[1], 0))) { 894 + *err = -EBUSY; 895 + goto drop_data_sem; 896 + } 897 + replaced_count = mext_replace_branches(handle, orig_inode, 898 + donor_inode, orig_blk_offset, 899 + block_len_in_page, err); 900 + drop_data_sem: 901 + 
double_up_write_data_sem(orig_inode, donor_inode); 902 + goto unlock_pages; 903 + } 904 + data_copy: 905 + *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size); 906 + if (*err) 907 + goto unlock_pages; 908 + 909 + /* At this point all buffers in range are uptodate, old mapping layout 910 + * is no longer required, try to drop it now. */ 911 + if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) || 912 + (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) { 913 + *err = -EBUSY; 914 + goto unlock_pages; 963 915 } 964 916 965 - /* 966 - * try_to_release_page() doesn't call releasepage in writeback mode. 967 - * We should care about the order of writing to the same file 968 - * by multiple move extent processes. 969 - * It needs to call wait_on_page_writeback() to wait for the 970 - * writeback of the page. 971 - */ 972 - wait_on_page_writeback(page); 973 - 974 - /* Release old bh and drop refs */ 975 - try_to_release_page(page, 0); 976 - 977 917 replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, 978 - orig_blk_offset, block_len_in_page, 979 - &err2); 980 - if (err2) { 918 + orig_blk_offset, 919 + block_len_in_page, err); 920 + if (*err) { 981 921 if (replaced_count) { 982 922 block_len_in_page = replaced_count; 983 923 replaced_size = 984 924 block_len_in_page << orig_inode->i_blkbits; 985 925 } else 986 - goto out; 926 + goto unlock_pages; 987 927 } 928 + /* Perform all necessary steps similar write_begin()/write_end() 929 + * but keeping in mind that i_size will not change */ 930 + *err = __block_write_begin(pagep[0], from, from + replaced_size, 931 + ext4_get_block); 932 + if (!*err) 933 + *err = block_commit_write(pagep[0], from, from + replaced_size); 988 934 989 - if (!page_has_buffers(page)) 990 - create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); 935 + if (unlikely(*err < 0)) 936 + goto repair_branches; 991 937 992 - bh = page_buffers(page); 993 - for (i = 0; i < data_offset_in_page; 
i++) 994 - bh = bh->b_this_page; 938 + /* Even in case of data=writeback it is reasonable to pin 939 + * inode to transaction, to prevent unexpected data loss */ 940 + *err = ext4_jbd2_file_inode(handle, orig_inode); 995 941 996 - for (i = 0; i < block_len_in_page; i++) { 997 - *err = ext4_get_block(orig_inode, 998 - (sector_t)(orig_blk_offset + i), bh, 0); 999 - if (*err < 0) 1000 - goto out; 1001 - 1002 - if (bh->b_this_page != NULL) 1003 - bh = bh->b_this_page; 1004 - } 1005 - 1006 - *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size, 1007 - page, fsdata); 1008 - page = NULL; 1009 - 1010 - out: 1011 - if (unlikely(page)) { 1012 - if (PageLocked(page)) 1013 - unlock_page(page); 1014 - page_cache_release(page); 1015 - ext4_journal_stop(handle); 1016 - } 1017 - out2: 942 + unlock_pages: 943 + unlock_page(pagep[0]); 944 + page_cache_release(pagep[0]); 945 + unlock_page(pagep[1]); 946 + page_cache_release(pagep[1]); 947 + stop_journal: 1018 948 ext4_journal_stop(handle); 1019 - 1020 - if (err2) 1021 - *err = err2; 1022 - 949 + /* Buffer was busy because probably is pinned to journal transaction, 950 + * force transaction commit may help to free it. */ 951 + if (*err == -EBUSY && ext4_should_retry_alloc(orig_inode->i_sb, 952 + &retries)) 953 + goto again; 1023 954 return replaced_count; 955 + 956 + repair_branches: 957 + /* 958 + * This should never ever happen! 959 + * Extents are swapped already, but we are not able to copy data. 
960 + * Try to swap extents to it's original places 961 + */ 962 + double_down_write_data_sem(orig_inode, donor_inode); 963 + replaced_count = mext_replace_branches(handle, donor_inode, orig_inode, 964 + orig_blk_offset, 965 + block_len_in_page, &err2); 966 + double_up_write_data_sem(orig_inode, donor_inode); 967 + if (replaced_count != block_len_in_page) { 968 + EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), 969 + "Unable to copy data block," 970 + " data will be lost."); 971 + *err = -EIO; 972 + } 973 + replaced_count = 0; 974 + goto unlock_pages; 1024 975 } 1025 976 1026 977 /** ··· 1108 969 return -EINVAL; 1109 970 } 1110 971 1111 - /* Files should be in the same ext4 FS */ 1112 - if (orig_inode->i_sb != donor_inode->i_sb) { 1113 - ext4_debug("ext4 move extent: The argument files " 1114 - "should be in same FS [ino:orig %lu, donor %lu]\n", 1115 - orig_inode->i_ino, donor_inode->i_ino); 1116 - return -EINVAL; 1117 - } 1118 - 1119 972 /* Ext4 move extent supports only extent based file */ 1120 973 if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { 1121 974 ext4_debug("ext4 move extent: orig file is not extents " ··· 1133 1002 } 1134 1003 1135 1004 if ((orig_start >= EXT_MAX_BLOCKS) || 1136 - (donor_start >= EXT_MAX_BLOCKS) || 1137 1005 (*len > EXT_MAX_BLOCKS) || 1138 1006 (orig_start + *len >= EXT_MAX_BLOCKS)) { 1139 1007 ext4_debug("ext4 move extent: Can't handle over [%u] blocks " ··· 1202 1072 * @inode1: the inode structure 1203 1073 * @inode2: the inode structure 1204 1074 * 1205 - * Lock two inodes' i_mutex by i_ino order. 1206 - * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. 
1075 + * Lock two inodes' i_mutex 1207 1076 */ 1208 - static int 1077 + static void 1209 1078 mext_inode_double_lock(struct inode *inode1, struct inode *inode2) 1210 1079 { 1211 - int ret = 0; 1212 - 1213 - BUG_ON(inode1 == NULL && inode2 == NULL); 1214 - 1215 - ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); 1216 - if (ret < 0) 1217 - goto out; 1218 - 1219 - if (inode1 == inode2) { 1220 - mutex_lock(&inode1->i_mutex); 1221 - goto out; 1222 - } 1223 - 1224 - if (inode1->i_ino < inode2->i_ino) { 1080 + BUG_ON(inode1 == inode2); 1081 + if (inode1 < inode2) { 1225 1082 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); 1226 1083 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); 1227 1084 } else { 1228 1085 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); 1229 1086 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); 1230 1087 } 1231 - 1232 - out: 1233 - return ret; 1234 1088 } 1235 1089 1236 1090 /** ··· 1223 1109 * @inode1: the inode that is released first 1224 1110 * @inode2: the inode that is released second 1225 1111 * 1226 - * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. 
1227 1112 */ 1228 1113 1229 - static int 1114 + static void 1230 1115 mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) 1231 1116 { 1232 - int ret = 0; 1233 - 1234 - BUG_ON(inode1 == NULL && inode2 == NULL); 1235 - 1236 - ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); 1237 - if (ret < 0) 1238 - goto out; 1239 - 1240 - if (inode1) 1241 - mutex_unlock(&inode1->i_mutex); 1242 - 1243 - if (inode2 && inode2 != inode1) 1244 - mutex_unlock(&inode2->i_mutex); 1245 - 1246 - out: 1247 - return ret; 1117 + mutex_unlock(&inode1->i_mutex); 1118 + mutex_unlock(&inode2->i_mutex); 1248 1119 } 1249 1120 1250 1121 /** ··· 1286 1187 ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; 1287 1188 ext4_lblk_t rest_blocks; 1288 1189 pgoff_t orig_page_offset = 0, seq_end_page; 1289 - int ret1, ret2, depth, last_extent = 0; 1190 + int ret, depth, last_extent = 0; 1290 1191 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; 1291 1192 int data_offset_in_page; 1292 1193 int block_len_in_page; 1293 1194 int uninit; 1294 1195 1295 - /* orig and donor should be different file */ 1296 - if (orig_inode->i_ino == donor_inode->i_ino) { 1196 + if (orig_inode->i_sb != donor_inode->i_sb) { 1197 + ext4_debug("ext4 move extent: The argument files " 1198 + "should be in same FS [ino:orig %lu, donor %lu]\n", 1199 + orig_inode->i_ino, donor_inode->i_ino); 1200 + return -EINVAL; 1201 + } 1202 + 1203 + /* orig and donor should be different inodes */ 1204 + if (orig_inode == donor_inode) { 1297 1205 ext4_debug("ext4 move extent: The argument files should not " 1298 - "be same file [ino:orig %lu, donor %lu]\n", 1206 + "be same inode [ino:orig %lu, donor %lu]\n", 1299 1207 orig_inode->i_ino, donor_inode->i_ino); 1300 1208 return -EINVAL; 1301 1209 } ··· 1314 1208 orig_inode->i_ino, donor_inode->i_ino); 1315 1209 return -EINVAL; 1316 1210 } 1317 - 1211 + /* TODO: This is non obvious task to swap blocks for inodes with full 1212 + jornaling 
enabled */ 1213 + if (ext4_should_journal_data(orig_inode) || 1214 + ext4_should_journal_data(donor_inode)) { 1215 + return -EINVAL; 1216 + } 1318 1217 /* Protect orig and donor inodes against a truncate */ 1319 - ret1 = mext_inode_double_lock(orig_inode, donor_inode); 1320 - if (ret1 < 0) 1321 - return ret1; 1218 + mext_inode_double_lock(orig_inode, donor_inode); 1219 + 1220 + /* Wait for all existing dio workers */ 1221 + ext4_inode_block_unlocked_dio(orig_inode); 1222 + ext4_inode_block_unlocked_dio(donor_inode); 1223 + inode_dio_wait(orig_inode); 1224 + inode_dio_wait(donor_inode); 1322 1225 1323 1226 /* Protect extent tree against block allocations via delalloc */ 1324 1227 double_down_write_data_sem(orig_inode, donor_inode); 1325 1228 /* Check the filesystem environment whether move_extent can be done */ 1326 - ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, 1229 + ret = mext_check_arguments(orig_inode, donor_inode, orig_start, 1327 1230 donor_start, &len); 1328 - if (ret1) 1231 + if (ret) 1329 1232 goto out; 1330 1233 1331 1234 file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; ··· 1342 1227 if (file_end < block_end) 1343 1228 len -= block_end - file_end; 1344 1229 1345 - ret1 = get_ext_path(orig_inode, block_start, &orig_path); 1346 - if (ret1) 1230 + ret = get_ext_path(orig_inode, block_start, &orig_path); 1231 + if (ret) 1347 1232 goto out; 1348 1233 1349 1234 /* Get path structure to check the hole */ 1350 - ret1 = get_ext_path(orig_inode, block_start, &holecheck_path); 1351 - if (ret1) 1235 + ret = get_ext_path(orig_inode, block_start, &holecheck_path); 1236 + if (ret) 1352 1237 goto out; 1353 1238 1354 1239 depth = ext_depth(orig_inode); ··· 1367 1252 last_extent = mext_next_extent(orig_inode, 1368 1253 holecheck_path, &ext_cur); 1369 1254 if (last_extent < 0) { 1370 - ret1 = last_extent; 1255 + ret = last_extent; 1371 1256 goto out; 1372 1257 } 1373 1258 last_extent = mext_next_extent(orig_inode, orig_path, 1374 1259 
&ext_dummy); 1375 1260 if (last_extent < 0) { 1376 - ret1 = last_extent; 1261 + ret = last_extent; 1377 1262 goto out; 1378 1263 } 1379 1264 seq_start = le32_to_cpu(ext_cur->ee_block); ··· 1387 1272 if (le32_to_cpu(ext_cur->ee_block) > block_end) { 1388 1273 ext4_debug("ext4 move extent: The specified range of file " 1389 1274 "may be the hole\n"); 1390 - ret1 = -EINVAL; 1275 + ret = -EINVAL; 1391 1276 goto out; 1392 1277 } 1393 1278 ··· 1407 1292 last_extent = mext_next_extent(orig_inode, holecheck_path, 1408 1293 &ext_cur); 1409 1294 if (last_extent < 0) { 1410 - ret1 = last_extent; 1295 + ret = last_extent; 1411 1296 break; 1412 1297 } 1413 1298 add_blocks = ext4_ext_get_actual_len(ext_cur); ··· 1464 1349 orig_page_offset, 1465 1350 data_offset_in_page, 1466 1351 block_len_in_page, uninit, 1467 - &ret1); 1352 + &ret); 1468 1353 1469 1354 /* Count how many blocks we have exchanged */ 1470 1355 *moved_len += block_len_in_page; 1471 - if (ret1 < 0) 1356 + if (ret < 0) 1472 1357 break; 1473 1358 if (*moved_len > len) { 1474 1359 EXT4_ERROR_INODE(orig_inode, 1475 1360 "We replaced blocks too much! 
" 1476 1361 "sum of replaced: %llu requested: %llu", 1477 1362 *moved_len, len); 1478 - ret1 = -EIO; 1363 + ret = -EIO; 1479 1364 break; 1480 1365 } 1481 1366 ··· 1489 1374 } 1490 1375 1491 1376 double_down_write_data_sem(orig_inode, donor_inode); 1492 - if (ret1 < 0) 1377 + if (ret < 0) 1493 1378 break; 1494 1379 1495 1380 /* Decrease buffer counter */ 1496 1381 if (holecheck_path) 1497 1382 ext4_ext_drop_refs(holecheck_path); 1498 - ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path); 1499 - if (ret1) 1383 + ret = get_ext_path(orig_inode, seq_start, &holecheck_path); 1384 + if (ret) 1500 1385 break; 1501 1386 depth = holecheck_path->p_depth; 1502 1387 1503 1388 /* Decrease buffer counter */ 1504 1389 if (orig_path) 1505 1390 ext4_ext_drop_refs(orig_path); 1506 - ret1 = get_ext_path(orig_inode, seq_start, &orig_path); 1507 - if (ret1) 1391 + ret = get_ext_path(orig_inode, seq_start, &orig_path); 1392 + if (ret) 1508 1393 break; 1509 1394 1510 1395 ext_cur = holecheck_path[depth].p_ext; ··· 1527 1412 kfree(holecheck_path); 1528 1413 } 1529 1414 double_up_write_data_sem(orig_inode, donor_inode); 1530 - ret2 = mext_inode_double_unlock(orig_inode, donor_inode); 1415 + ext4_inode_resume_unlocked_dio(orig_inode); 1416 + ext4_inode_resume_unlocked_dio(donor_inode); 1417 + mext_inode_double_unlock(orig_inode, donor_inode); 1531 1418 1532 - if (ret1) 1533 - return ret1; 1534 - else if (ret2) 1535 - return ret2; 1536 - 1537 - return 0; 1419 + return ret; 1538 1420 }
+86 -19
fs/ext4/namei.c
··· 55 55 { 56 56 struct buffer_head *bh; 57 57 58 + if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && 59 + ((inode->i_size >> 10) >= 60 + EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) { 61 + *err = -ENOSPC; 62 + return NULL; 63 + } 64 + 58 65 *block = inode->i_size >> inode->i_sb->s_blocksize_bits; 59 66 60 67 bh = ext4_bread(handle, inode, *block, 1, err); ··· 73 66 brelse(bh); 74 67 bh = NULL; 75 68 } 69 + } 70 + if (!bh && !(*err)) { 71 + *err = -EIO; 72 + ext4_error(inode->i_sb, 73 + "Directory hole detected on inode %lu\n", 74 + inode->i_ino); 76 75 } 77 76 return bh; 78 77 } ··· 607 594 u32 hash; 608 595 609 596 frame->bh = NULL; 610 - if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) 597 + if (!(bh = ext4_bread(NULL, dir, 0, 0, err))) { 598 + if (*err == 0) 599 + *err = ERR_BAD_DX_DIR; 611 600 goto fail; 601 + } 612 602 root = (struct dx_root *) bh->b_data; 613 603 if (root->info.hash_version != DX_HASH_TEA && 614 604 root->info.hash_version != DX_HASH_HALF_MD4 && ··· 712 696 frame->entries = entries; 713 697 frame->at = at; 714 698 if (!indirect--) return frame; 715 - if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err))) 699 + if (!(bh = ext4_bread(NULL, dir, dx_get_block(at), 0, err))) { 700 + if (!(*err)) 701 + *err = ERR_BAD_DX_DIR; 716 702 goto fail2; 703 + } 717 704 at = entries = ((struct dx_node *) bh->b_data)->entries; 718 705 719 706 if (!buffer_verified(bh) && ··· 826 807 */ 827 808 while (num_frames--) { 828 809 if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at), 829 - 0, &err))) 810 + 0, &err))) { 811 + if (!err) { 812 + ext4_error(dir->i_sb, 813 + "Directory hole detected on inode %lu\n", 814 + dir->i_ino); 815 + return -EIO; 816 + } 830 817 return err; /* Failure */ 818 + } 831 819 832 820 if (!buffer_verified(bh) && 833 821 !ext4_dx_csum_verify(dir, ··· 865 839 { 866 840 struct buffer_head *bh; 867 841 struct ext4_dir_entry_2 *de, *top; 868 - int err, count = 0; 842 + int err = 0, count = 0; 869 843 870 844 dxtrace(printk(KERN_INFO 
"In htree dirblock_to_tree: block %lu\n", 871 845 (unsigned long)block)); 872 - if (!(bh = ext4_bread (NULL, dir, block, 0, &err))) 846 + if (!(bh = ext4_bread(NULL, dir, block, 0, &err))) { 847 + if (!err) { 848 + err = -EIO; 849 + ext4_error(dir->i_sb, 850 + "Directory hole detected on inode %lu\n", 851 + dir->i_ino); 852 + } 873 853 return err; 854 + } 874 855 875 856 if (!buffer_verified(bh) && 876 857 !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) ··· 1300 1267 return NULL; 1301 1268 do { 1302 1269 block = dx_get_block(frame->at); 1303 - if (!(bh = ext4_bread(NULL, dir, block, 0, err))) 1270 + if (!(bh = ext4_bread(NULL, dir, block, 0, err))) { 1271 + if (!(*err)) { 1272 + *err = -EIO; 1273 + ext4_error(dir->i_sb, 1274 + "Directory hole detected on inode %lu\n", 1275 + dir->i_ino); 1276 + } 1304 1277 goto errout; 1278 + } 1305 1279 1306 1280 if (!buffer_verified(bh) && 1307 1281 !ext4_dirent_csum_verify(dir, ··· 1841 1801 } 1842 1802 blocks = dir->i_size >> sb->s_blocksize_bits; 1843 1803 for (block = 0; block < blocks; block++) { 1844 - bh = ext4_bread(handle, dir, block, 0, &retval); 1845 - if(!bh) 1804 + if (!(bh = ext4_bread(handle, dir, block, 0, &retval))) { 1805 + if (!retval) { 1806 + retval = -EIO; 1807 + ext4_error(inode->i_sb, 1808 + "Directory hole detected on inode %lu\n", 1809 + inode->i_ino); 1810 + } 1846 1811 return retval; 1812 + } 1847 1813 if (!buffer_verified(bh) && 1848 1814 !ext4_dirent_csum_verify(dir, 1849 1815 (struct ext4_dir_entry *)bh->b_data)) ··· 1906 1860 entries = frame->entries; 1907 1861 at = frame->at; 1908 1862 1909 - if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err))) 1863 + if (!(bh = ext4_bread(handle, dir, dx_get_block(frame->at), 0, &err))) { 1864 + if (!err) { 1865 + err = -EIO; 1866 + ext4_error(dir->i_sb, 1867 + "Directory hole detected on inode %lu\n", 1868 + dir->i_ino); 1869 + } 1910 1870 goto cleanup; 1871 + } 1911 1872 1912 1873 if (!buffer_verified(bh) && 1913 1874 
!ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) ··· 2202 2149 err = PTR_ERR(inode); 2203 2150 if (!IS_ERR(inode)) { 2204 2151 init_special_inode(inode, inode->i_mode, rdev); 2205 - #ifdef CONFIG_EXT4_FS_XATTR 2206 2152 inode->i_op = &ext4_special_inode_operations; 2207 - #endif 2208 2153 err = ext4_add_nondir(handle, dentry, inode); 2209 2154 } 2210 2155 ext4_journal_stop(handle); ··· 2250 2199 inode->i_op = &ext4_dir_inode_operations; 2251 2200 inode->i_fop = &ext4_dir_operations; 2252 2201 inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; 2253 - dir_block = ext4_bread(handle, inode, 0, 1, &err); 2254 - if (!dir_block) 2202 + if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { 2203 + if (!err) { 2204 + err = -EIO; 2205 + ext4_error(inode->i_sb, 2206 + "Directory hole detected on inode %lu\n", 2207 + inode->i_ino); 2208 + } 2255 2209 goto out_clear_inode; 2210 + } 2256 2211 BUFFER_TRACE(dir_block, "get_write_access"); 2257 2212 err = ext4_journal_get_write_access(handle, dir_block); 2258 2213 if (err) ··· 2375 2318 EXT4_ERROR_INODE(inode, 2376 2319 "error %d reading directory " 2377 2320 "lblock %u", err, lblock); 2321 + else 2322 + ext4_warning(inode->i_sb, 2323 + "bad directory (dir #%lu) - no data block", 2324 + inode->i_ino); 2325 + 2378 2326 offset += sb->s_blocksize; 2379 2327 continue; 2380 2328 } ··· 2424 2362 struct ext4_iloc iloc; 2425 2363 int err = 0, rc; 2426 2364 2427 - if (!ext4_handle_valid(handle)) 2365 + if (!EXT4_SB(sb)->s_journal) 2428 2366 return 0; 2429 2367 2430 2368 mutex_lock(&EXT4_SB(sb)->s_orphan_lock); ··· 2498 2436 struct ext4_iloc iloc; 2499 2437 int err = 0; 2500 2438 2501 - /* ext4_handle_valid() assumes a valid handle_t pointer */ 2502 - if (handle && !ext4_handle_valid(handle)) 2439 + if (!EXT4_SB(inode->i_sb)->s_journal) 2503 2440 return 0; 2504 2441 2505 2442 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); ··· 2517 2456 * transaction handle with which to update the orphan list on 
2518 2457 * disk, but we still need to remove the inode from the linked 2519 2458 * list in memory. */ 2520 - if (sbi->s_journal && !handle) 2459 + if (!handle) 2521 2460 goto out; 2522 2461 2523 2462 err = ext4_reserve_inode_write(handle, inode, &iloc); ··· 2887 2826 goto end_rename; 2888 2827 } 2889 2828 retval = -EIO; 2890 - dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval); 2891 - if (!dir_bh) 2829 + if (!(dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval))) { 2830 + if (!retval) { 2831 + retval = -EIO; 2832 + ext4_error(old_inode->i_sb, 2833 + "Directory hole detected on inode %lu\n", 2834 + old_inode->i_ino); 2835 + } 2892 2836 goto end_rename; 2837 + } 2893 2838 if (!buffer_verified(dir_bh) && 2894 2839 !ext4_dirent_csum_verify(old_inode, 2895 2840 (struct ext4_dir_entry *)dir_bh->b_data))
+120 -58
fs/ext4/page-io.c
··· 71 71 int i; 72 72 73 73 BUG_ON(!io); 74 + BUG_ON(!list_empty(&io->list)); 75 + BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); 76 + 74 77 if (io->page) 75 78 put_page(io->page); 76 79 for (i = 0; i < io->num_io_pages; i++) ··· 84 81 kmem_cache_free(io_end_cachep, io); 85 82 } 86 83 87 - /* 88 - * check a range of space and convert unwritten extents to written. 89 - * 90 - * Called with inode->i_mutex; we depend on this when we manipulate 91 - * io->flag, since we could otherwise race with ext4_flush_completed_IO() 92 - */ 93 - int ext4_end_io_nolock(ext4_io_end_t *io) 84 + /* check a range of space and convert unwritten extents to written. */ 85 + static int ext4_end_io(ext4_io_end_t *io) 94 86 { 95 87 struct inode *inode = io->inode; 96 88 loff_t offset = io->offset; ··· 104 106 "(inode %lu, offset %llu, size %zd, error %d)", 105 107 inode->i_ino, offset, size, ret); 106 108 } 107 - 108 109 if (io->iocb) 109 110 aio_complete(io->iocb, io->result, 0); 110 111 111 112 if (io->flag & EXT4_IO_END_DIRECT) 112 113 inode_dio_done(inode); 113 114 /* Wake up anyone waiting on unwritten extent conversion */ 114 - if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) 115 + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) 115 116 wake_up_all(ext4_ioend_wq(io->inode)); 117 + return ret; 118 + } 119 + 120 + static void dump_completed_IO(struct inode *inode) 121 + { 122 + #ifdef EXT4FS_DEBUG 123 + struct list_head *cur, *before, *after; 124 + ext4_io_end_t *io, *io0, *io1; 125 + unsigned long flags; 126 + 127 + if (list_empty(&EXT4_I(inode)->i_completed_io_list)) { 128 + ext4_debug("inode %lu completed_io list is empty\n", 129 + inode->i_ino); 130 + return; 131 + } 132 + 133 + ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino); 134 + list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) { 135 + cur = &io->list; 136 + before = cur->prev; 137 + io0 = container_of(before, ext4_io_end_t, list); 138 + after = cur->next; 139 + io1 = 
container_of(after, ext4_io_end_t, list); 140 + 141 + ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", 142 + io, inode->i_ino, io0, io1); 143 + } 144 + #endif 145 + } 146 + 147 + /* Add the io_end to per-inode completed end_io list. */ 148 + void ext4_add_complete_io(ext4_io_end_t *io_end) 149 + { 150 + struct ext4_inode_info *ei = EXT4_I(io_end->inode); 151 + struct workqueue_struct *wq; 152 + unsigned long flags; 153 + 154 + BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); 155 + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 156 + 157 + spin_lock_irqsave(&ei->i_completed_io_lock, flags); 158 + if (list_empty(&ei->i_completed_io_list)) { 159 + io_end->flag |= EXT4_IO_END_QUEUED; 160 + queue_work(wq, &io_end->work); 161 + } 162 + list_add_tail(&io_end->list, &ei->i_completed_io_list); 163 + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 164 + } 165 + 166 + static int ext4_do_flush_completed_IO(struct inode *inode, 167 + ext4_io_end_t *work_io) 168 + { 169 + ext4_io_end_t *io; 170 + struct list_head unwritten, complete, to_free; 171 + unsigned long flags; 172 + struct ext4_inode_info *ei = EXT4_I(inode); 173 + int err, ret = 0; 174 + 175 + INIT_LIST_HEAD(&complete); 176 + INIT_LIST_HEAD(&to_free); 177 + 178 + spin_lock_irqsave(&ei->i_completed_io_lock, flags); 179 + dump_completed_IO(inode); 180 + list_replace_init(&ei->i_completed_io_list, &unwritten); 181 + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 182 + 183 + while (!list_empty(&unwritten)) { 184 + io = list_entry(unwritten.next, ext4_io_end_t, list); 185 + BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN)); 186 + list_del_init(&io->list); 187 + 188 + err = ext4_end_io(io); 189 + if (unlikely(!ret && err)) 190 + ret = err; 191 + 192 + list_add_tail(&io->list, &complete); 193 + } 194 + spin_lock_irqsave(&ei->i_completed_io_lock, flags); 195 + while (!list_empty(&complete)) { 196 + io = list_entry(complete.next, ext4_io_end_t, list); 197 + io->flag &= ~EXT4_IO_END_UNWRITTEN; 198 + 
/* end_io context can not be destroyed now because it still 199 + * used by queued worker. Worker thread will destroy it later */ 200 + if (io->flag & EXT4_IO_END_QUEUED) 201 + list_del_init(&io->list); 202 + else 203 + list_move(&io->list, &to_free); 204 + } 205 + /* If we are called from worker context, it is time to clear queued 206 + * flag, and destroy it's end_io if it was converted already */ 207 + if (work_io) { 208 + work_io->flag &= ~EXT4_IO_END_QUEUED; 209 + if (!(work_io->flag & EXT4_IO_END_UNWRITTEN)) 210 + list_add_tail(&work_io->list, &to_free); 211 + } 212 + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 213 + 214 + while (!list_empty(&to_free)) { 215 + io = list_entry(to_free.next, ext4_io_end_t, list); 216 + list_del_init(&io->list); 217 + ext4_free_io_end(io); 218 + } 116 219 return ret; 117 220 } 118 221 ··· 222 123 */ 223 124 static void ext4_end_io_work(struct work_struct *work) 224 125 { 225 - ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); 226 - struct inode *inode = io->inode; 227 - struct ext4_inode_info *ei = EXT4_I(inode); 228 - unsigned long flags; 126 + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); 127 + ext4_do_flush_completed_IO(io->inode, io); 128 + } 229 129 230 - spin_lock_irqsave(&ei->i_completed_io_lock, flags); 231 - if (io->flag & EXT4_IO_END_IN_FSYNC) 232 - goto requeue; 233 - if (list_empty(&io->list)) { 234 - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 235 - goto free; 236 - } 237 - 238 - if (!mutex_trylock(&inode->i_mutex)) { 239 - bool was_queued; 240 - requeue: 241 - was_queued = !!(io->flag & EXT4_IO_END_QUEUED); 242 - io->flag |= EXT4_IO_END_QUEUED; 243 - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 244 - /* 245 - * Requeue the work instead of waiting so that the work 246 - * items queued after this can be processed. 
247 - */ 248 - queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work); 249 - /* 250 - * To prevent the ext4-dio-unwritten thread from keeping 251 - * requeueing end_io requests and occupying cpu for too long, 252 - * yield the cpu if it sees an end_io request that has already 253 - * been requeued. 254 - */ 255 - if (was_queued) 256 - yield(); 257 - return; 258 - } 259 - list_del_init(&io->list); 260 - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 261 - (void) ext4_end_io_nolock(io); 262 - mutex_unlock(&inode->i_mutex); 263 - free: 264 - ext4_free_io_end(io); 130 + int ext4_flush_unwritten_io(struct inode *inode) 131 + { 132 + int ret; 133 + WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) && 134 + !(inode->i_state & I_FREEING)); 135 + ret = ext4_do_flush_completed_IO(inode, NULL); 136 + ext4_unwritten_wait(inode); 137 + return ret; 265 138 } 266 139 267 140 ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) ··· 266 195 static void ext4_end_bio(struct bio *bio, int error) 267 196 { 268 197 ext4_io_end_t *io_end = bio->bi_private; 269 - struct workqueue_struct *wq; 270 198 struct inode *inode; 271 - unsigned long flags; 272 199 int i; 273 200 sector_t bi_sector = bio->bi_sector; 274 201 ··· 324 255 return; 325 256 } 326 257 327 - /* Add the io_end to per-inode completed io list*/ 328 - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); 329 - list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); 330 - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); 331 - 332 - wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; 333 - /* queue the work to convert unwritten extents to written */ 334 - queue_work(wq, &io_end->work); 258 + ext4_add_complete_io(io_end); 335 259 } 336 260 337 261 void ext4_io_submit(struct ext4_io_submit *io)
+344 -90
fs/ext4/resize.c
··· 45 45 smp_mb__after_clear_bit(); 46 46 } 47 47 48 + static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb, 49 + ext4_group_t group) { 50 + return (group >> EXT4_DESC_PER_BLOCK_BITS(sb)) << 51 + EXT4_DESC_PER_BLOCK_BITS(sb); 52 + } 53 + 54 + static ext4_fsblk_t ext4_meta_bg_first_block_no(struct super_block *sb, 55 + ext4_group_t group) { 56 + group = ext4_meta_bg_first_group(sb, group); 57 + return ext4_group_first_block_no(sb, group); 58 + } 59 + 60 + static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb, 61 + ext4_group_t group) { 62 + ext4_grpblk_t overhead; 63 + overhead = ext4_bg_num_gdb(sb, group); 64 + if (ext4_bg_has_super(sb, group)) 65 + overhead += 1 + 66 + le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); 67 + return overhead; 68 + } 69 + 48 70 #define outside(b, first, last) ((b) < (first) || (b) >= (last)) 49 71 #define inside(b, first, last) ((b) >= (first) && (b) < (last)) 50 72 ··· 79 57 ext4_fsblk_t end = start + input->blocks_count; 80 58 ext4_group_t group = input->group; 81 59 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; 82 - unsigned overhead = ext4_bg_has_super(sb, group) ? 83 - (1 + ext4_bg_num_gdb(sb, group) + 84 - le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; 60 + unsigned overhead = ext4_group_overhead_blocks(sb, group); 85 61 ext4_fsblk_t metaend = start + overhead; 86 62 struct buffer_head *bh = NULL; 87 63 ext4_grpblk_t free_blocks_count, offset; ··· 220 200 * be a partial of a flex group. 221 201 * 222 202 * @sb: super block of fs to which the groups belongs 203 + * 204 + * Returns 0 on a successful allocation of the metadata blocks in the 205 + * block group. 
223 206 */ 224 - static void ext4_alloc_group_tables(struct super_block *sb, 207 + static int ext4_alloc_group_tables(struct super_block *sb, 225 208 struct ext4_new_flex_group_data *flex_gd, 226 209 int flexbg_size) 227 210 { 228 211 struct ext4_new_group_data *group_data = flex_gd->groups; 229 - struct ext4_super_block *es = EXT4_SB(sb)->s_es; 230 212 ext4_fsblk_t start_blk; 231 213 ext4_fsblk_t last_blk; 232 214 ext4_group_t src_group; ··· 248 226 (last_group & ~(flexbg_size - 1)))); 249 227 next_group: 250 228 group = group_data[0].group; 229 + if (src_group >= group_data[0].group + flex_gd->count) 230 + return -ENOSPC; 251 231 start_blk = ext4_group_first_block_no(sb, src_group); 252 232 last_blk = start_blk + group_data[src_group - group].blocks_count; 253 233 254 - overhead = ext4_bg_has_super(sb, src_group) ? 255 - (1 + ext4_bg_num_gdb(sb, src_group) + 256 - le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; 234 + overhead = ext4_group_overhead_blocks(sb, src_group); 257 235 258 236 start_blk += overhead; 259 237 260 - BUG_ON(src_group >= group_data[0].group + flex_gd->count); 261 238 /* We collect contiguous blocks as much as possible. 
*/ 262 239 src_group++; 263 - for (; src_group <= last_group; src_group++) 264 - if (!ext4_bg_has_super(sb, src_group)) 240 + for (; src_group <= last_group; src_group++) { 241 + overhead = ext4_group_overhead_blocks(sb, src_group); 242 + if (overhead != 0) 265 243 last_blk += group_data[src_group - group].blocks_count; 266 244 else 267 245 break; 246 + } 268 247 269 248 /* Allocate block bitmaps */ 270 249 for (; bb_index < flex_gd->count; bb_index++) { ··· 323 300 group_data[i].free_blocks_count); 324 301 } 325 302 } 303 + return 0; 326 304 } 327 305 328 306 static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, ··· 457 433 ext4_group_t group, count; 458 434 struct buffer_head *bh = NULL; 459 435 int reserved_gdb, i, j, err = 0, err2; 436 + int meta_bg; 460 437 461 438 BUG_ON(!flex_gd->count || !group_data || 462 439 group_data[0].group != sbi->s_groups_count); 463 440 464 441 reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); 442 + meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); 465 443 466 444 /* This transaction may be extended/restarted along the way */ 467 445 handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); ··· 473 447 group = group_data[0].group; 474 448 for (i = 0; i < flex_gd->count; i++, group++) { 475 449 unsigned long gdblocks; 450 + ext4_grpblk_t overhead; 476 451 477 452 gdblocks = ext4_bg_num_gdb(sb, group); 478 453 start = ext4_group_first_block_no(sb, group); 479 454 455 + if (meta_bg == 0 && !ext4_bg_has_super(sb, group)) 456 + goto handle_itb; 457 + 458 + if (meta_bg == 1) { 459 + ext4_group_t first_group; 460 + first_group = ext4_meta_bg_first_group(sb, group); 461 + if (first_group != group + 1 && 462 + first_group != group + EXT4_DESC_PER_BLOCK(sb) - 1) 463 + goto handle_itb; 464 + } 465 + 466 + block = start + ext4_bg_has_super(sb, group); 480 467 /* Copy all of the GDT blocks into the backup in this group */ 481 - for (j = 0, block = start + 1; j < gdblocks; j++, block++) { 468 + for 
(j = 0; j < gdblocks; j++, block++) { 482 469 struct buffer_head *gdb; 483 470 484 471 ext4_debug("update backup group %#04llx\n", block); ··· 532 493 goto out; 533 494 } 534 495 496 + handle_itb: 535 497 /* Initialize group tables of the grop @group */ 536 498 if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED)) 537 499 goto handle_bb; ··· 561 521 err = PTR_ERR(bh); 562 522 goto out; 563 523 } 564 - if (ext4_bg_has_super(sb, group)) { 524 + overhead = ext4_group_overhead_blocks(sb, group); 525 + if (overhead != 0) { 565 526 ext4_debug("mark backup superblock %#04llx (+0)\n", 566 527 start); 567 - ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + 568 - 1); 528 + ext4_set_bits(bh->b_data, 0, overhead); 569 529 } 570 530 ext4_mark_bitmap_end(group_data[i].blocks_count, 571 531 sb->s_blocksize * 8, bh->b_data); ··· 862 822 } 863 823 864 824 /* 825 + * add_new_gdb_meta_bg is the sister of add_new_gdb. 826 + */ 827 + static int add_new_gdb_meta_bg(struct super_block *sb, 828 + handle_t *handle, ext4_group_t group) { 829 + ext4_fsblk_t gdblock; 830 + struct buffer_head *gdb_bh; 831 + struct buffer_head **o_group_desc, **n_group_desc; 832 + unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); 833 + int err; 834 + 835 + gdblock = ext4_meta_bg_first_block_no(sb, group) + 836 + ext4_bg_has_super(sb, group); 837 + gdb_bh = sb_bread(sb, gdblock); 838 + if (!gdb_bh) 839 + return -EIO; 840 + n_group_desc = ext4_kvmalloc((gdb_num + 1) * 841 + sizeof(struct buffer_head *), 842 + GFP_NOFS); 843 + if (!n_group_desc) { 844 + err = -ENOMEM; 845 + ext4_warning(sb, "not enough memory for %lu groups", 846 + gdb_num + 1); 847 + return err; 848 + } 849 + 850 + o_group_desc = EXT4_SB(sb)->s_group_desc; 851 + memcpy(n_group_desc, o_group_desc, 852 + EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); 853 + n_group_desc[gdb_num] = gdb_bh; 854 + EXT4_SB(sb)->s_group_desc = n_group_desc; 855 + EXT4_SB(sb)->s_gdb_count++; 856 + ext4_kvfree(o_group_desc); 857 + err = 
ext4_journal_get_write_access(handle, gdb_bh); 858 + if (unlikely(err)) 859 + brelse(gdb_bh); 860 + return err; 861 + } 862 + 863 + /* 865 864 * Called when we are adding a new group which has a backup copy of each of 866 865 * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. 867 866 * We need to add these reserved backup GDT blocks to the resize inode, so ··· 1028 949 * do not copy the full number of backups at this time. The resize 1029 950 * which changed s_groups_count will backup again. 1030 951 */ 1031 - static void update_backups(struct super_block *sb, 1032 - int blk_off, char *data, int size) 952 + static void update_backups(struct super_block *sb, int blk_off, char *data, 953 + int size, int meta_bg) 1033 954 { 1034 955 struct ext4_sb_info *sbi = EXT4_SB(sb); 1035 - const ext4_group_t last = sbi->s_groups_count; 956 + ext4_group_t last; 1036 957 const int bpg = EXT4_BLOCKS_PER_GROUP(sb); 1037 958 unsigned three = 1; 1038 959 unsigned five = 5; 1039 960 unsigned seven = 7; 1040 - ext4_group_t group; 961 + ext4_group_t group = 0; 1041 962 int rest = sb->s_blocksize - size; 1042 963 handle_t *handle; 1043 964 int err = 0, err2; ··· 1049 970 goto exit_err; 1050 971 } 1051 972 1052 - ext4_superblock_csum_set(sb, (struct ext4_super_block *)data); 973 + if (meta_bg == 0) { 974 + group = ext4_list_backups(sb, &three, &five, &seven); 975 + last = sbi->s_groups_count; 976 + } else { 977 + group = ext4_meta_bg_first_group(sb, group) + 1; 978 + last = (ext4_group_t)(group + EXT4_DESC_PER_BLOCK(sb) - 2); 979 + } 1053 980 1054 - while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) { 981 + while (group < sbi->s_groups_count) { 1055 982 struct buffer_head *bh; 983 + ext4_fsblk_t backup_block; 1056 984 1057 985 /* Out of journal space, and can't get more - abort - so sad */ 1058 986 if (ext4_handle_valid(handle) && ··· 1068 982 (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) 1069 983 break; 1070 984 1071 - bh = 
sb_getblk(sb, group * bpg + blk_off); 985 + if (meta_bg == 0) 986 + backup_block = group * bpg + blk_off; 987 + else 988 + backup_block = (ext4_group_first_block_no(sb, group) + 989 + ext4_bg_has_super(sb, group)); 990 + 991 + bh = sb_getblk(sb, backup_block); 1072 992 if (!bh) { 1073 993 err = -EIO; 1074 994 break; 1075 995 } 1076 - ext4_debug("update metadata backup %#04lx\n", 1077 - (unsigned long)bh->b_blocknr); 996 + ext4_debug("update metadata backup %llu(+%llu)\n", 997 + backup_block, backup_block - 998 + ext4_group_first_block_no(sb, group)); 1078 999 if ((err = ext4_journal_get_write_access(handle, bh))) 1079 1000 break; 1080 1001 lock_buffer(bh); ··· 1094 1001 if (unlikely(err)) 1095 1002 ext4_std_error(sb, err); 1096 1003 brelse(bh); 1004 + 1005 + if (meta_bg == 0) 1006 + group = ext4_list_backups(sb, &three, &five, &seven); 1007 + else if (group == last) 1008 + break; 1009 + else 1010 + group = last; 1097 1011 } 1098 1012 if ((err2 = ext4_journal_stop(handle)) && !err) 1099 1013 err = err2; ··· 1143 1043 struct ext4_super_block *es = sbi->s_es; 1144 1044 struct buffer_head *gdb_bh; 1145 1045 int i, gdb_off, gdb_num, err = 0; 1046 + int meta_bg; 1146 1047 1048 + meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); 1147 1049 for (i = 0; i < count; i++, group++) { 1148 1050 int reserved_gdb = ext4_bg_has_super(sb, group) ? 
1149 1051 le16_to_cpu(es->s_reserved_gdt_blocks) : 0; ··· 1165 1063 1166 1064 if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group)) 1167 1065 err = reserve_backup_gdb(handle, resize_inode, group); 1168 - } else 1066 + } else if (meta_bg != 0) { 1067 + err = add_new_gdb_meta_bg(sb, handle, group); 1068 + } else { 1169 1069 err = add_new_gdb(handle, resize_inode, group); 1070 + } 1170 1071 if (err) 1171 1072 break; 1172 1073 } ··· 1181 1076 struct buffer_head *bh = sb_getblk(sb, block); 1182 1077 if (!bh) 1183 1078 return NULL; 1184 - 1185 - if (bitmap_uptodate(bh)) 1186 - return bh; 1187 - 1188 - lock_buffer(bh); 1189 - if (bh_submit_read(bh) < 0) { 1190 - unlock_buffer(bh); 1191 - brelse(bh); 1192 - return NULL; 1079 + if (!bh_uptodate_or_lock(bh)) { 1080 + if (bh_submit_read(bh) < 0) { 1081 + brelse(bh); 1082 + return NULL; 1083 + } 1193 1084 } 1194 - unlock_buffer(bh); 1195 1085 1196 1086 return bh; 1197 1087 } ··· 1261 1161 ext4_free_group_clusters_set(sb, gdp, 1262 1162 EXT4_B2C(sbi, group_data->free_blocks_count)); 1263 1163 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); 1164 + if (ext4_has_group_desc_csum(sb)) 1165 + ext4_itable_unused_set(sb, gdp, 1166 + EXT4_INODES_PER_GROUP(sb)); 1264 1167 gdp->bg_flags = cpu_to_le16(*bg_flags); 1265 1168 ext4_group_desc_csum_set(sb, group, gdp); 1266 1169 ··· 1319 1216 } 1320 1217 1321 1218 reserved_blocks = ext4_r_blocks_count(es) * 100; 1322 - do_div(reserved_blocks, ext4_blocks_count(es)); 1219 + reserved_blocks = div64_u64(reserved_blocks, ext4_blocks_count(es)); 1323 1220 reserved_blocks *= blocks_count; 1324 1221 do_div(reserved_blocks, 100); 1325 1222 ··· 1330 1227 le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) * 1331 1228 flex_gd->count); 1332 1229 1230 + ext4_debug("free blocks count %llu", ext4_free_blocks_count(es)); 1333 1231 /* 1334 1232 * We need to protect s_groups_count against other CPUs seeing 1335 1233 * inconsistent state in the superblock. 
··· 1365 1261 percpu_counter_add(&sbi->s_freeinodes_counter, 1366 1262 EXT4_INODES_PER_GROUP(sb) * flex_gd->count); 1367 1263 1264 + ext4_debug("free blocks count %llu", 1265 + percpu_counter_read(&sbi->s_freeclusters_counter)); 1368 1266 if (EXT4_HAS_INCOMPAT_FEATURE(sb, 1369 1267 EXT4_FEATURE_INCOMPAT_FLEX_BG) && 1370 1268 sbi->s_log_groups_per_flex) { ··· 1455 1349 err = err2; 1456 1350 1457 1351 if (!err) { 1458 - int i; 1352 + int gdb_num = group / EXT4_DESC_PER_BLOCK(sb); 1353 + int gdb_num_end = ((group + flex_gd->count - 1) / 1354 + EXT4_DESC_PER_BLOCK(sb)); 1355 + int meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, 1356 + EXT4_FEATURE_INCOMPAT_META_BG); 1357 + sector_t old_gdb = 0; 1358 + 1459 1359 update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, 1460 - sizeof(struct ext4_super_block)); 1461 - for (i = 0; i < flex_gd->count; i++, group++) { 1360 + sizeof(struct ext4_super_block), 0); 1361 + for (; gdb_num <= gdb_num_end; gdb_num++) { 1462 1362 struct buffer_head *gdb_bh; 1463 - int gdb_num; 1464 - gdb_num = group / EXT4_BLOCKS_PER_GROUP(sb); 1363 + 1465 1364 gdb_bh = sbi->s_group_desc[gdb_num]; 1365 + if (old_gdb == gdb_bh->b_blocknr) 1366 + continue; 1466 1367 update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data, 1467 - gdb_bh->b_size); 1368 + gdb_bh->b_size, meta_bg); 1369 + old_gdb = gdb_bh->b_blocknr; 1468 1370 } 1469 1371 } 1470 1372 exit: ··· 1516 1402 1517 1403 group_data[i].group = group + i; 1518 1404 group_data[i].blocks_count = blocks_per_group; 1519 - overhead = ext4_bg_has_super(sb, group + i) ? 
1520 - (1 + ext4_bg_num_gdb(sb, group + i) + 1521 - le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; 1405 + overhead = ext4_group_overhead_blocks(sb, group + i); 1522 1406 group_data[i].free_blocks_count = blocks_per_group - overhead; 1523 1407 if (ext4_has_group_desc_csum(sb)) 1524 1408 flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | ··· 1604 1492 if (err) 1605 1493 goto out; 1606 1494 1495 + err = ext4_alloc_flex_bg_array(sb, input->group + 1); 1496 + if (err) 1497 + return err; 1498 + 1499 + err = ext4_mb_alloc_groupinfo(sb, input->group + 1); 1500 + if (err) 1501 + goto out; 1502 + 1607 1503 flex_gd.count = 1; 1608 1504 flex_gd.groups = input; 1609 1505 flex_gd.bg_flags = &bg_flags; ··· 1664 1544 err = err2; 1665 1545 1666 1546 if (!err) { 1547 + ext4_fsblk_t first_block; 1548 + first_block = ext4_group_first_block_no(sb, 0); 1667 1549 if (test_opt(sb, DEBUG)) 1668 1550 printk(KERN_DEBUG "EXT4-fs: extended group to %llu " 1669 1551 "blocks\n", ext4_blocks_count(es)); 1670 - update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es, 1671 - sizeof(struct ext4_super_block)); 1552 + update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block, 1553 + (char *)es, sizeof(struct ext4_super_block), 0); 1672 1554 } 1673 1555 return err; 1674 1556 } ··· 1753 1631 return err; 1754 1632 } /* ext4_group_extend */ 1755 1633 1634 + 1635 + static int num_desc_blocks(struct super_block *sb, ext4_group_t groups) 1636 + { 1637 + return (groups + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); 1638 + } 1639 + 1640 + /* 1641 + * Release the resize inode and drop the resize_inode feature if there 1642 + * are no more reserved gdt blocks, and then convert the file system 1643 + * to enable meta_bg 1644 + */ 1645 + static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) 1646 + { 1647 + handle_t *handle; 1648 + struct ext4_sb_info *sbi = EXT4_SB(sb); 1649 + struct ext4_super_block *es = sbi->s_es; 1650 + struct ext4_inode_info *ei = EXT4_I(inode); 1651 + 
ext4_fsblk_t nr; 1652 + int i, ret, err = 0; 1653 + int credits = 1; 1654 + 1655 + ext4_msg(sb, KERN_INFO, "Converting file system to meta_bg"); 1656 + if (inode) { 1657 + if (es->s_reserved_gdt_blocks) { 1658 + ext4_error(sb, "Unexpected non-zero " 1659 + "s_reserved_gdt_blocks"); 1660 + return -EPERM; 1661 + } 1662 + 1663 + /* Do a quick sanity check of the resize inode */ 1664 + if (inode->i_blocks != 1 << (inode->i_blkbits - 9)) 1665 + goto invalid_resize_inode; 1666 + for (i = 0; i < EXT4_N_BLOCKS; i++) { 1667 + if (i == EXT4_DIND_BLOCK) { 1668 + if (ei->i_data[i]) 1669 + continue; 1670 + else 1671 + goto invalid_resize_inode; 1672 + } 1673 + if (ei->i_data[i]) 1674 + goto invalid_resize_inode; 1675 + } 1676 + credits += 3; /* block bitmap, bg descriptor, resize inode */ 1677 + } 1678 + 1679 + handle = ext4_journal_start_sb(sb, credits); 1680 + if (IS_ERR(handle)) 1681 + return PTR_ERR(handle); 1682 + 1683 + err = ext4_journal_get_write_access(handle, sbi->s_sbh); 1684 + if (err) 1685 + goto errout; 1686 + 1687 + EXT4_CLEAR_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE); 1688 + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); 1689 + sbi->s_es->s_first_meta_bg = 1690 + cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count)); 1691 + 1692 + err = ext4_handle_dirty_super(handle, sb); 1693 + if (err) { 1694 + ext4_std_error(sb, err); 1695 + goto errout; 1696 + } 1697 + 1698 + if (inode) { 1699 + nr = le32_to_cpu(ei->i_data[EXT4_DIND_BLOCK]); 1700 + ext4_free_blocks(handle, inode, NULL, nr, 1, 1701 + EXT4_FREE_BLOCKS_METADATA | 1702 + EXT4_FREE_BLOCKS_FORGET); 1703 + ei->i_data[EXT4_DIND_BLOCK] = 0; 1704 + inode->i_blocks = 0; 1705 + 1706 + err = ext4_mark_inode_dirty(handle, inode); 1707 + if (err) 1708 + ext4_std_error(sb, err); 1709 + } 1710 + 1711 + errout: 1712 + ret = ext4_journal_stop(handle); 1713 + if (!err) 1714 + err = ret; 1715 + return ret; 1716 + 1717 + invalid_resize_inode: 1718 + ext4_error(sb, "corrupted/inconsistent resize inode"); 
1719 + return -EINVAL; 1720 + } 1721 + 1756 1722 /* 1757 1723 * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count 1758 1724 * ··· 1853 1643 struct ext4_sb_info *sbi = EXT4_SB(sb); 1854 1644 struct ext4_super_block *es = sbi->s_es; 1855 1645 struct buffer_head *bh; 1856 - struct inode *resize_inode; 1857 - ext4_fsblk_t o_blocks_count; 1858 - ext4_group_t o_group; 1859 - ext4_group_t n_group; 1860 - ext4_grpblk_t offset, add; 1646 + struct inode *resize_inode = NULL; 1647 + ext4_grpblk_t add, offset; 1861 1648 unsigned long n_desc_blocks; 1862 1649 unsigned long o_desc_blocks; 1863 - unsigned long desc_blocks; 1864 - int err = 0, flexbg_size = 1; 1650 + ext4_group_t o_group; 1651 + ext4_group_t n_group; 1652 + ext4_fsblk_t o_blocks_count; 1653 + ext4_fsblk_t n_blocks_count_retry = 0; 1654 + unsigned long last_update_time = 0; 1655 + int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex; 1656 + int meta_bg; 1865 1657 1658 + /* See if the device is actually as big as what was requested */ 1659 + bh = sb_bread(sb, n_blocks_count - 1); 1660 + if (!bh) { 1661 + ext4_warning(sb, "can't read last block, resize aborted"); 1662 + return -ENOSPC; 1663 + } 1664 + brelse(bh); 1665 + 1666 + retry: 1866 1667 o_blocks_count = ext4_blocks_count(es); 1867 1668 1868 - if (test_opt(sb, DEBUG)) 1869 - ext4_msg(sb, KERN_DEBUG, "resizing filesystem from %llu " 1870 - "to %llu blocks", o_blocks_count, n_blocks_count); 1669 + ext4_msg(sb, KERN_INFO, "resizing filesystem from %llu " 1670 + "to %llu blocks", o_blocks_count, n_blocks_count); 1871 1671 1872 1672 if (n_blocks_count < o_blocks_count) { 1873 1673 /* On-line shrinking not supported */ ··· 1892 1672 ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); 1893 1673 ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); 1894 1674 1895 - n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) / 1896 - EXT4_DESC_PER_BLOCK(sb); 1897 - o_desc_blocks = (sbi->s_groups_count + 
EXT4_DESC_PER_BLOCK(sb) - 1) / 1898 - EXT4_DESC_PER_BLOCK(sb); 1899 - desc_blocks = n_desc_blocks - o_desc_blocks; 1675 + n_desc_blocks = num_desc_blocks(sb, n_group + 1); 1676 + o_desc_blocks = num_desc_blocks(sb, sbi->s_groups_count); 1900 1677 1901 - if (desc_blocks && 1902 - (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) || 1903 - le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks)) { 1904 - ext4_warning(sb, "No reserved GDT blocks, can't resize"); 1905 - return -EPERM; 1678 + meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); 1679 + 1680 + if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE)) { 1681 + if (meta_bg) { 1682 + ext4_error(sb, "resize_inode and meta_bg enabled " 1683 + "simultaneously"); 1684 + return -EINVAL; 1685 + } 1686 + if (n_desc_blocks > o_desc_blocks + 1687 + le16_to_cpu(es->s_reserved_gdt_blocks)) { 1688 + n_blocks_count_retry = n_blocks_count; 1689 + n_desc_blocks = o_desc_blocks + 1690 + le16_to_cpu(es->s_reserved_gdt_blocks); 1691 + n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb); 1692 + n_blocks_count = n_group * EXT4_BLOCKS_PER_GROUP(sb); 1693 + n_group--; /* set to last group number */ 1694 + } 1695 + 1696 + if (!resize_inode) 1697 + resize_inode = ext4_iget(sb, EXT4_RESIZE_INO); 1698 + if (IS_ERR(resize_inode)) { 1699 + ext4_warning(sb, "Error opening resize inode"); 1700 + return PTR_ERR(resize_inode); 1701 + } 1906 1702 } 1907 1703 1908 - resize_inode = ext4_iget(sb, EXT4_RESIZE_INO); 1909 - if (IS_ERR(resize_inode)) { 1910 - ext4_warning(sb, "Error opening resize inode"); 1911 - return PTR_ERR(resize_inode); 1704 + if ((!resize_inode && !meta_bg) || n_blocks_count == o_blocks_count) { 1705 + err = ext4_convert_meta_bg(sb, resize_inode); 1706 + if (err) 1707 + goto out; 1708 + if (resize_inode) { 1709 + iput(resize_inode); 1710 + resize_inode = NULL; 1711 + } 1712 + if (n_blocks_count_retry) { 1713 + n_blocks_count = n_blocks_count_retry; 1714 + n_blocks_count_retry = 0; 1715 
+ goto retry; 1716 + } 1912 1717 } 1913 - 1914 - /* See if the device is actually as big as what was requested */ 1915 - bh = sb_bread(sb, n_blocks_count - 1); 1916 - if (!bh) { 1917 - ext4_warning(sb, "can't read last block, resize aborted"); 1918 - return -ENOSPC; 1919 - } 1920 - brelse(bh); 1921 1718 1922 1719 /* extend the last group */ 1923 1720 if (n_group == o_group) ··· 1947 1710 goto out; 1948 1711 } 1949 1712 1950 - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && 1951 - es->s_log_groups_per_flex) 1952 - flexbg_size = 1 << es->s_log_groups_per_flex; 1713 + if (ext4_blocks_count(es) == n_blocks_count) 1714 + goto out; 1953 1715 1954 - o_blocks_count = ext4_blocks_count(es); 1955 - if (o_blocks_count == n_blocks_count) 1716 + err = ext4_alloc_flex_bg_array(sb, n_group + 1); 1717 + if (err) 1718 + return err; 1719 + 1720 + err = ext4_mb_alloc_groupinfo(sb, n_group + 1); 1721 + if (err) 1956 1722 goto out; 1957 1723 1958 1724 flex_gd = alloc_flex_gd(flexbg_size); ··· 1969 1729 */ 1970 1730 while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count, 1971 1731 flexbg_size)) { 1972 - ext4_alloc_group_tables(sb, flex_gd, flexbg_size); 1732 + if (jiffies - last_update_time > HZ * 10) { 1733 + if (last_update_time) 1734 + ext4_msg(sb, KERN_INFO, 1735 + "resized to %llu blocks", 1736 + ext4_blocks_count(es)); 1737 + last_update_time = jiffies; 1738 + } 1739 + if (ext4_alloc_group_tables(sb, flex_gd, flexbg_size) != 0) 1740 + break; 1973 1741 err = ext4_flex_group_add(sb, resize_inode, flex_gd); 1974 1742 if (unlikely(err)) 1975 1743 break; 1976 1744 } 1977 1745 1746 + if (!err && n_blocks_count_retry) { 1747 + n_blocks_count = n_blocks_count_retry; 1748 + n_blocks_count_retry = 0; 1749 + free_flex_gd(flex_gd); 1750 + flex_gd = NULL; 1751 + goto retry; 1752 + } 1753 + 1978 1754 out: 1979 1755 if (flex_gd) 1980 1756 free_flex_gd(flex_gd); 1981 - 1982 - iput(resize_inode); 1983 - if (test_opt(sb, DEBUG)) 1984 - ext4_msg(sb, KERN_DEBUG, "resized 
filesystem from %llu " 1985 - "upto %llu blocks", o_blocks_count, n_blocks_count); 1757 + if (resize_inode != NULL) 1758 + iput(resize_inode); 1759 + ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", n_blocks_count); 1986 1760 return err; 1987 1761 }
+60 -32
fs/ext4/super.c
··· 420 420 */ 421 421 if (!es->s_error_count) 422 422 mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ); 423 - es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1); 423 + le32_add_cpu(&es->s_error_count, 1); 424 424 } 425 425 426 426 static void save_error_info(struct super_block *sb, const char *func, ··· 850 850 flush_workqueue(sbi->dio_unwritten_wq); 851 851 destroy_workqueue(sbi->dio_unwritten_wq); 852 852 853 - lock_super(sb); 854 853 if (sbi->s_journal) { 855 854 err = jbd2_journal_destroy(sbi->s_journal); 856 855 sbi->s_journal = NULL; ··· 916 917 * Now that we are completely done shutting down the 917 918 * superblock, we need to actually destroy the kobject. 918 919 */ 919 - unlock_super(sb); 920 920 kobject_put(&sbi->s_kobj); 921 921 wait_for_completion(&sbi->s_kobj_unregister); 922 922 if (sbi->s_chksum_driver) ··· 954 956 ei->jinode = NULL; 955 957 INIT_LIST_HEAD(&ei->i_completed_io_list); 956 958 spin_lock_init(&ei->i_completed_io_lock); 957 - ei->cur_aio_dio = NULL; 958 959 ei->i_sync_tid = 0; 959 960 ei->i_datasync_tid = 0; 960 961 atomic_set(&ei->i_ioend_count, 0); 961 - atomic_set(&ei->i_aiodio_unwritten, 0); 962 + atomic_set(&ei->i_unwritten, 0); 962 963 963 964 return &ei->vfs_inode; 964 965 } ··· 1221 1224 Opt_inode_readahead_blks, Opt_journal_ioprio, 1222 1225 Opt_dioread_nolock, Opt_dioread_lock, 1223 1226 Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, 1227 + Opt_max_dir_size_kb, 1224 1228 }; 1225 1229 1226 1230 static const match_table_t tokens = { ··· 1295 1297 {Opt_init_itable, "init_itable=%u"}, 1296 1298 {Opt_init_itable, "init_itable"}, 1297 1299 {Opt_noinit_itable, "noinit_itable"}, 1300 + {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, 1298 1301 {Opt_removed, "check=none"}, /* mount option from ext2/3 */ 1299 1302 {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ 1300 1303 {Opt_removed, "reservation"}, /* mount option from ext2/3 */ ··· 1476 1477 {Opt_jqfmt_vfsold, QFMT_VFS_OLD, 
MOPT_QFMT}, 1477 1478 {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, 1478 1479 {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, 1480 + {Opt_max_dir_size_kb, 0, MOPT_GTE0}, 1479 1481 {Opt_err, 0, 0} 1480 1482 }; 1481 1483 ··· 1592 1592 if (!args->from) 1593 1593 arg = EXT4_DEF_LI_WAIT_MULT; 1594 1594 sbi->s_li_wait_mult = arg; 1595 + } else if (token == Opt_max_dir_size_kb) { 1596 + sbi->s_max_dir_size_kb = arg; 1595 1597 } else if (token == Opt_stripe) { 1596 1598 sbi->s_stripe = arg; 1597 1599 } else if (m->flags & MOPT_DATAJ) { ··· 1666 1664 * Initialize args struct so we know whether arg was 1667 1665 * found; some options take optional arguments. 1668 1666 */ 1669 - args[0].to = args[0].from = 0; 1667 + args[0].to = args[0].from = NULL; 1670 1668 token = match_token(p, tokens, args); 1671 1669 if (handle_mount_opt(sb, p, token, args, journal_devnum, 1672 1670 journal_ioprio, is_remount) < 0) ··· 1742 1740 1743 1741 static const char *token2str(int token) 1744 1742 { 1745 - static const struct match_token *t; 1743 + const struct match_token *t; 1746 1744 1747 1745 for (t = tokens; t->token != Opt_err; t++) 1748 1746 if (t->token == token && !strchr(t->pattern, '=')) ··· 1825 1823 if (nodefs || (test_opt(sb, INIT_INODE_TABLE) && 1826 1824 (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) 1827 1825 SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); 1826 + if (nodefs || sbi->s_max_dir_size_kb) 1827 + SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); 1828 1828 1829 1829 ext4_show_quota_options(seq, sb); 1830 1830 return 0; ··· 1918 1914 return res; 1919 1915 } 1920 1916 1917 + int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) 1918 + { 1919 + struct ext4_sb_info *sbi = EXT4_SB(sb); 1920 + struct flex_groups *new_groups; 1921 + int size; 1922 + 1923 + if (!sbi->s_log_groups_per_flex) 1924 + return 0; 1925 + 1926 + size = ext4_flex_group(sbi, ngroup - 1) + 1; 1927 + if (size <= sbi->s_flex_groups_allocated) 1928 + return 0; 1929 + 1930 + 
size = roundup_pow_of_two(size * sizeof(struct flex_groups)); 1931 + new_groups = ext4_kvzalloc(size, GFP_KERNEL); 1932 + if (!new_groups) { 1933 + ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", 1934 + size / (int) sizeof(struct flex_groups)); 1935 + return -ENOMEM; 1936 + } 1937 + 1938 + if (sbi->s_flex_groups) { 1939 + memcpy(new_groups, sbi->s_flex_groups, 1940 + (sbi->s_flex_groups_allocated * 1941 + sizeof(struct flex_groups))); 1942 + ext4_kvfree(sbi->s_flex_groups); 1943 + } 1944 + sbi->s_flex_groups = new_groups; 1945 + sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups); 1946 + return 0; 1947 + } 1948 + 1921 1949 static int ext4_fill_flex_info(struct super_block *sb) 1922 1950 { 1923 1951 struct ext4_sb_info *sbi = EXT4_SB(sb); 1924 1952 struct ext4_group_desc *gdp = NULL; 1925 - ext4_group_t flex_group_count; 1926 1953 ext4_group_t flex_group; 1927 1954 unsigned int groups_per_flex = 0; 1928 - size_t size; 1929 - int i; 1955 + int i, err; 1930 1956 1931 1957 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; 1932 1958 if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { ··· 1965 1931 } 1966 1932 groups_per_flex = 1 << sbi->s_log_groups_per_flex; 1967 1933 1968 - /* We allocate both existing and potentially added groups */ 1969 - flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + 1970 - ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << 1971 - EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; 1972 - size = flex_group_count * sizeof(struct flex_groups); 1973 - sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL); 1974 - if (sbi->s_flex_groups == NULL) { 1975 - ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups", 1976 - flex_group_count); 1934 + err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); 1935 + if (err) 1977 1936 goto failed; 1978 - } 1979 1937 1980 1938 for (i = 0; i < sbi->s_groups_count; i++) { 1981 1939 gdp = ext4_get_group_desc(sb, i, NULL); ··· 2170 
2144 } 2171 2145 2172 2146 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { 2173 - if (es->s_last_orphan) 2147 + /* don't clear list on RO mount w/ errors */ 2148 + if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { 2174 2149 jbd_debug(1, "Errors on filesystem, " 2175 2150 "clearing orphan list.\n"); 2176 - es->s_last_orphan = 0; 2151 + es->s_last_orphan = 0; 2152 + } 2177 2153 jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); 2178 2154 return; 2179 2155 } ··· 2556 2528 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 2557 2529 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 2558 2530 EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); 2531 + EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); 2559 2532 EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); 2560 2533 2561 2534 static struct attribute *ext4_attrs[] = { ··· 2572 2543 ATTR_LIST(mb_stream_req), 2573 2544 ATTR_LIST(mb_group_prealloc), 2574 2545 ATTR_LIST(max_writeback_mb_bump), 2546 + ATTR_LIST(extent_max_zeroout_kb), 2575 2547 ATTR_LIST(trigger_fs_error), 2576 2548 NULL, 2577 2549 }; ··· 2580 2550 /* Features this copy of ext4 supports */ 2581 2551 EXT4_INFO_ATTR(lazy_itable_init); 2582 2552 EXT4_INFO_ATTR(batched_discard); 2553 + EXT4_INFO_ATTR(meta_bg_resize); 2583 2554 2584 2555 static struct attribute *ext4_feat_attrs[] = { 2585 2556 ATTR_LIST(lazy_itable_init), 2586 2557 ATTR_LIST(batched_discard), 2558 + ATTR_LIST(meta_bg_resize), 2587 2559 NULL, 2588 2560 }; 2589 2561 ··· 3406 3374 * enable delayed allocation by default 3407 3375 * Use -o nodelalloc to turn it off 3408 3376 */ 3409 - if (!IS_EXT3_SB(sb) && 3377 + if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) && 3410 3378 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) 3411 3379 set_opt(sb, DELALLOC); 3412 3380 ··· 3775 3743 3776 3744 sbi->s_stripe = ext4_get_stripe_size(sbi); 3777 3745 sbi->s_max_writeback_mb_bump = 128; 3746 + sbi->s_extent_max_zeroout_kb = 32; 3778 3747 3779 
3748 /* 3780 3749 * set up enough so that it can read an inode ··· 4552 4519 if (sb->s_flags & MS_RDONLY) 4553 4520 return 0; 4554 4521 4555 - lock_super(sb); 4556 4522 /* Reset the needs_recovery flag before the fs is unlocked. */ 4557 4523 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 4558 4524 ext4_commit_super(sb, 1); 4559 - unlock_super(sb); 4560 4525 return 0; 4561 4526 } 4562 4527 ··· 4590 4559 char *orig_data = kstrdup(data, GFP_KERNEL); 4591 4560 4592 4561 /* Store the original options */ 4593 - lock_super(sb); 4594 4562 old_sb_flags = sb->s_flags; 4595 4563 old_opts.s_mount_opt = sbi->s_mount_opt; 4596 4564 old_opts.s_mount_opt2 = sbi->s_mount_opt2; ··· 4731 4701 if (sbi->s_journal == NULL) 4732 4702 ext4_commit_super(sb, 1); 4733 4703 4734 - unlock_super(sb); 4735 4704 #ifdef CONFIG_QUOTA 4736 4705 /* Release old quota file names */ 4737 4706 for (i = 0; i < MAXQUOTAS; i++) ··· 4743 4714 else if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4744 4715 EXT4_FEATURE_RO_COMPAT_QUOTA)) { 4745 4716 err = ext4_enable_quotas(sb); 4746 - if (err) { 4747 - lock_super(sb); 4717 + if (err) 4748 4718 goto restore_opts; 4749 - } 4750 4719 } 4751 4720 } 4752 4721 #endif ··· 4771 4744 sbi->s_qf_names[i] = old_opts.s_qf_names[i]; 4772 4745 } 4773 4746 #endif 4774 - unlock_super(sb); 4775 4747 kfree(orig_data); 4776 4748 return err; 4777 4749 } ··· 5295 5269 if (err) 5296 5270 goto out6; 5297 5271 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); 5298 - if (!ext4_kset) 5272 + if (!ext4_kset) { 5273 + err = -ENOMEM; 5299 5274 goto out5; 5275 + } 5300 5276 ext4_proc_root = proc_mkdir("fs/ext4", NULL); 5301 5277 5302 5278 err = ext4_init_feat_adverts();
+1
fs/fs-writeback.c
··· 63 63 { 64 64 return test_bit(BDI_writeback_running, &bdi->state); 65 65 } 66 + EXPORT_SYMBOL(writeback_in_progress); 66 67 67 68 static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) 68 69 {
+29 -11
fs/jbd2/commit.c
··· 1014 1014 * there's no point in keeping a checkpoint record for 1015 1015 * it. */ 1016 1016 1017 - /* A buffer which has been freed while still being 1018 - * journaled by a previous transaction may end up still 1019 - * being dirty here, but we want to avoid writing back 1020 - * that buffer in the future after the "add to orphan" 1021 - * operation been committed, That's not only a performance 1022 - * gain, it also stops aliasing problems if the buffer is 1023 - * left behind for writeback and gets reallocated for another 1024 - * use in a different page. */ 1025 - if (buffer_freed(bh) && !jh->b_next_transaction) { 1026 - clear_buffer_freed(bh); 1027 - clear_buffer_jbddirty(bh); 1017 + /* 1018 + * A buffer which has been freed while still being journaled by 1019 + * a previous transaction. 1020 + */ 1021 + if (buffer_freed(bh)) { 1022 + /* 1023 + * If the running transaction is the one containing 1024 + * "add to orphan" operation (b_next_transaction != 1025 + * NULL), we have to wait for that transaction to 1026 + * commit before we can really get rid of the buffer. 1027 + * So just clear b_modified to not confuse transaction 1028 + * credit accounting and refile the buffer to 1029 + * BJ_Forget of the running transaction. If the just 1030 + * committed transaction contains "add to orphan" 1031 + * operation, we can completely invalidate the buffer 1032 + * now. We are rather thorough in that since the 1033 + * buffer may still be accessible when blocksize < 1034 + * pagesize and it is attached to the last partial 1035 + * page. 1036 + */ 1037 + jh->b_modified = 0; 1038 + if (!jh->b_next_transaction) { 1039 + clear_buffer_freed(bh); 1040 + clear_buffer_jbddirty(bh); 1041 + clear_buffer_mapped(bh); 1042 + clear_buffer_new(bh); 1043 + clear_buffer_req(bh); 1044 + bh->b_bdev = NULL; 1045 + } 1028 1046 } 1029 1047 1030 1048 if (buffer_jbddirty(bh)) {
+5
fs/jbd2/journal.c
··· 1354 1354 1355 1355 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); 1356 1356 read_lock(&journal->j_state_lock); 1357 + /* Is it already empty? */ 1358 + if (sb->s_start == 0) { 1359 + read_unlock(&journal->j_state_lock); 1360 + return; 1361 + } 1357 1362 jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n", 1358 1363 journal->j_tail_sequence); 1359 1364
+5 -2
fs/jbd2/recovery.c
··· 289 289 if (!err) 290 290 err = err2; 291 291 /* Make sure all replayed data is on permanent storage */ 292 - if (journal->j_flags & JBD2_BARRIER) 293 - blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); 292 + if (journal->j_flags & JBD2_BARRIER) { 293 + err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); 294 + if (!err) 295 + err = err2; 296 + } 294 297 return err; 295 298 } 296 299
+45 -20
fs/jbd2/transaction.c
··· 1841 1841 * We're outside-transaction here. Either or both of j_running_transaction 1842 1842 * and j_committing_transaction may be NULL. 1843 1843 */ 1844 - static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) 1844 + static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, 1845 + int partial_page) 1845 1846 { 1846 1847 transaction_t *transaction; 1847 1848 struct journal_head *jh; 1848 1849 int may_free = 1; 1849 - int ret; 1850 1850 1851 1851 BUFFER_TRACE(bh, "entry"); 1852 1852 1853 + retry: 1853 1854 /* 1854 1855 * It is safe to proceed here without the j_list_lock because the 1855 1856 * buffers cannot be stolen by try_to_free_buffers as long as we are ··· 1879 1878 * clear the buffer dirty bit at latest at the moment when the 1880 1879 * transaction marking the buffer as freed in the filesystem 1881 1880 * structures is committed because from that moment on the 1882 - * buffer can be reallocated and used by a different page. 1881 + * block can be reallocated and used by a different page. 1883 1882 * Since the block hasn't been freed yet but the inode has 1884 1883 * already been added to orphan list, it is safe for us to add 1885 1884 * the buffer to BJ_Forget list of the newest transaction. 1885 + * 1886 + * Also we have to clear buffer_mapped flag of a truncated buffer 1887 + * because the buffer_head may be attached to the page straddling 1888 + * i_size (can happen only when blocksize < pagesize) and thus the 1889 + * buffer_head can be reused when the file is extended again. So we end 1890 + * up keeping around invalidated buffers attached to transactions' 1891 + * BJ_Forget list just to stop checkpointing code from cleaning up 1892 + * the transaction this buffer was modified in. 1886 1893 */ 1887 1894 transaction = jh->b_transaction; 1888 1895 if (transaction == NULL) { ··· 1917 1908 * committed, the buffer won't be needed any 1918 1909 * longer. 
*/ 1919 1910 JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); 1920 - ret = __dispose_buffer(jh, 1911 + may_free = __dispose_buffer(jh, 1921 1912 journal->j_running_transaction); 1922 - jbd2_journal_put_journal_head(jh); 1923 - spin_unlock(&journal->j_list_lock); 1924 - jbd_unlock_bh_state(bh); 1925 - write_unlock(&journal->j_state_lock); 1926 - return ret; 1913 + goto zap_buffer; 1927 1914 } else { 1928 1915 /* There is no currently-running transaction. So the 1929 1916 * orphan record which we wrote for this file must have ··· 1927 1922 * the committing transaction, if it exists. */ 1928 1923 if (journal->j_committing_transaction) { 1929 1924 JBUFFER_TRACE(jh, "give to committing trans"); 1930 - ret = __dispose_buffer(jh, 1925 + may_free = __dispose_buffer(jh, 1931 1926 journal->j_committing_transaction); 1932 - jbd2_journal_put_journal_head(jh); 1933 - spin_unlock(&journal->j_list_lock); 1934 - jbd_unlock_bh_state(bh); 1935 - write_unlock(&journal->j_state_lock); 1936 - return ret; 1927 + goto zap_buffer; 1937 1928 } else { 1938 1929 /* The orphan record's transaction has 1939 1930 * committed. We can cleanse this buffer */ ··· 1941 1940 JBUFFER_TRACE(jh, "on committing transaction"); 1942 1941 /* 1943 1942 * The buffer is committing, we simply cannot touch 1944 - * it. So we just set j_next_transaction to the 1945 - * running transaction (if there is one) and mark 1946 - * buffer as freed so that commit code knows it should 1947 - * clear dirty bits when it is done with the buffer. 1943 + * it. If the page is straddling i_size we have to wait 1944 + * for commit and try again. 
1945 + */ 1946 + if (partial_page) { 1947 + tid_t tid = journal->j_committing_transaction->t_tid; 1948 + 1949 + jbd2_journal_put_journal_head(jh); 1950 + spin_unlock(&journal->j_list_lock); 1951 + jbd_unlock_bh_state(bh); 1952 + write_unlock(&journal->j_state_lock); 1953 + jbd2_log_wait_commit(journal, tid); 1954 + goto retry; 1955 + } 1956 + /* 1957 + * OK, buffer won't be reachable after truncate. We just set 1958 + * j_next_transaction to the running transaction (if there is 1959 + * one) and mark buffer as freed so that commit code knows it 1960 + * should clear dirty bits when it is done with the buffer. 1948 1961 */ 1949 1962 set_buffer_freed(bh); 1950 1963 if (journal->j_running_transaction && buffer_jbddirty(bh)) ··· 1981 1966 } 1982 1967 1983 1968 zap_buffer: 1969 + /* 1970 + * This is tricky. Although the buffer is truncated, it may be reused 1971 + * if blocksize < pagesize and it is attached to the page straddling 1972 + * EOF. Since the buffer might have been added to BJ_Forget list of the 1973 + * running transaction, journal_get_write_access() won't clear 1974 + * b_modified and credit accounting gets confused. So clear b_modified 1975 + * here. 1976 + */ 1977 + jh->b_modified = 0; 1984 1978 jbd2_journal_put_journal_head(jh); 1985 1979 zap_buffer_no_jh: 1986 1980 spin_unlock(&journal->j_list_lock); ··· 2041 2017 if (offset <= curr_off) { 2042 2018 /* This block is wholly outside the truncation point */ 2043 2019 lock_buffer(bh); 2044 - may_free &= journal_unmap_buffer(journal, bh); 2020 + may_free &= journal_unmap_buffer(journal, bh, 2021 + offset > 0); 2045 2022 unlock_buffer(bh); 2046 2023 } 2047 2024 curr_off = next_off;
+1
fs/nilfs2/file.c
··· 116 116 if (unlikely(ret)) 117 117 goto out; 118 118 119 + file_update_time(vma->vm_file); 119 120 ret = __block_page_mkwrite(vma, vmf, nilfs_get_block); 120 121 if (ret) { 121 122 nilfs_transaction_abort(inode->i_sb);
+1
include/linux/falloc.h
··· 3 3 4 4 #define FALLOC_FL_KEEP_SIZE 0x01 /* default is extend size */ 5 5 #define FALLOC_FL_PUNCH_HOLE 0x02 /* de-allocates range */ 6 + #define FALLOC_FL_NO_HIDE_STALE 0x04 /* reserved codepoint */ 6 7 7 8 #ifdef __KERNEL__ 8 9
+123 -123
include/trace/events/ext4.h
··· 26 26 TP_STRUCT__entry( 27 27 __field( dev_t, dev ) 28 28 __field( ino_t, ino ) 29 - __field( __u16, mode ) 30 29 __field( uid_t, uid ) 31 30 __field( gid_t, gid ) 32 31 __field( __u64, blocks ) 32 + __field( __u16, mode ) 33 33 ), 34 34 35 35 TP_fast_assign( 36 36 __entry->dev = inode->i_sb->s_dev; 37 37 __entry->ino = inode->i_ino; 38 - __entry->mode = inode->i_mode; 39 38 __entry->uid = i_uid_read(inode); 40 39 __entry->gid = i_gid_read(inode); 41 40 __entry->blocks = inode->i_blocks; 41 + __entry->mode = inode->i_mode; 42 42 ), 43 43 44 44 TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu", ··· 300 300 __field( long, pages_skipped ) 301 301 __field( loff_t, range_start ) 302 302 __field( loff_t, range_end ) 303 + __field( pgoff_t, writeback_index ) 303 304 __field( int, sync_mode ) 304 305 __field( char, for_kupdate ) 305 306 __field( char, range_cyclic ) 306 - __field( pgoff_t, writeback_index ) 307 307 ), 308 308 309 309 TP_fast_assign( ··· 313 313 __entry->pages_skipped = wbc->pages_skipped; 314 314 __entry->range_start = wbc->range_start; 315 315 __entry->range_end = wbc->range_end; 316 + __entry->writeback_index = inode->i_mapping->writeback_index; 316 317 __entry->sync_mode = wbc->sync_mode; 317 318 __entry->for_kupdate = wbc->for_kupdate; 318 319 __entry->range_cyclic = wbc->range_cyclic; 319 - __entry->writeback_index = inode->i_mapping->writeback_index; 320 320 ), 321 321 322 322 TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld " 323 - "range_start %lld range_end %lld sync_mode %d" 323 + "range_start %lld range_end %lld sync_mode %d " 324 324 "for_kupdate %d range_cyclic %d writeback_index %lu", 325 325 MAJOR(__entry->dev), MINOR(__entry->dev), 326 326 (unsigned long) __entry->ino, __entry->nr_to_write, ··· 382 382 __field( int, ret ) 383 383 __field( int, pages_written ) 384 384 __field( long, pages_skipped ) 385 - __field( int, sync_mode ) 386 385 __field( pgoff_t, writeback_index ) 386 + __field( int, sync_mode ) 387 
387 ), 388 388 389 389 TP_fast_assign( ··· 392 392 __entry->ret = ret; 393 393 __entry->pages_written = pages_written; 394 394 __entry->pages_skipped = wbc->pages_skipped; 395 - __entry->sync_mode = wbc->sync_mode; 396 395 __entry->writeback_index = inode->i_mapping->writeback_index; 396 + __entry->sync_mode = wbc->sync_mode; 397 397 ), 398 398 399 399 TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld " ··· 411 411 TP_ARGS(page), 412 412 413 413 TP_STRUCT__entry( 414 - __field( pgoff_t, index ) 415 - __field( ino_t, ino ) 416 414 __field( dev_t, dev ) 415 + __field( ino_t, ino ) 416 + __field( pgoff_t, index ) 417 417 418 418 ), 419 419 420 420 TP_fast_assign( 421 - __entry->index = page->index; 422 - __entry->ino = page->mapping->host->i_ino; 423 421 __entry->dev = page->mapping->host->i_sb->s_dev; 422 + __entry->ino = page->mapping->host->i_ino; 423 + __entry->index = page->index; 424 424 ), 425 425 426 426 TP_printk("dev %d,%d ino %lu page_index %lu", ··· 456 456 TP_ARGS(page, offset), 457 457 458 458 TP_STRUCT__entry( 459 + __field( dev_t, dev ) 460 + __field( ino_t, ino ) 459 461 __field( pgoff_t, index ) 460 462 __field( unsigned long, offset ) 461 - __field( ino_t, ino ) 462 - __field( dev_t, dev ) 463 463 464 464 ), 465 465 466 466 TP_fast_assign( 467 + __entry->dev = page->mapping->host->i_sb->s_dev; 468 + __entry->ino = page->mapping->host->i_ino; 467 469 __entry->index = page->index; 468 470 __entry->offset = offset; 469 - __entry->ino = page->mapping->host->i_ino; 470 - __entry->dev = page->mapping->host->i_sb->s_dev; 471 471 ), 472 472 473 473 TP_printk("dev %d,%d ino %lu page_index %lu offset %lu", ··· 510 510 __field( dev_t, dev ) 511 511 __field( ino_t, ino ) 512 512 __field( __u64, pa_pstart ) 513 - __field( __u32, pa_len ) 514 513 __field( __u64, pa_lstart ) 514 + __field( __u32, pa_len ) 515 515 516 516 ), 517 517 ··· 519 519 __entry->dev = ac->ac_sb->s_dev; 520 520 __entry->ino = ac->ac_inode->i_ino; 521 521 
__entry->pa_pstart = pa->pa_pstart; 522 - __entry->pa_len = pa->pa_len; 523 522 __entry->pa_lstart = pa->pa_lstart; 523 + __entry->pa_len = pa->pa_len; 524 524 ), 525 525 526 526 TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu", ··· 645 645 TP_STRUCT__entry( 646 646 __field( dev_t, dev ) 647 647 __field( ino_t, ino ) 648 - __field( unsigned int, flags ) 649 648 __field( unsigned int, len ) 650 649 __field( __u32, logical ) 651 650 __field( __u32, lleft ) ··· 652 653 __field( __u64, goal ) 653 654 __field( __u64, pleft ) 654 655 __field( __u64, pright ) 656 + __field( unsigned int, flags ) 655 657 ), 656 658 657 659 TP_fast_assign( 658 660 __entry->dev = ar->inode->i_sb->s_dev; 659 661 __entry->ino = ar->inode->i_ino; 660 - __entry->flags = ar->flags; 661 662 __entry->len = ar->len; 662 663 __entry->logical = ar->logical; 663 664 __entry->goal = ar->goal; ··· 665 666 __entry->lright = ar->lright; 666 667 __entry->pleft = ar->pleft; 667 668 __entry->pright = ar->pright; 669 + __entry->flags = ar->flags; 668 670 ), 669 671 670 672 TP_printk("dev %d,%d ino %lu flags %u len %u lblk %u goal %llu " ··· 686 686 __field( dev_t, dev ) 687 687 __field( ino_t, ino ) 688 688 __field( __u64, block ) 689 - __field( unsigned int, flags ) 690 689 __field( unsigned int, len ) 691 690 __field( __u32, logical ) 692 691 __field( __u32, lleft ) ··· 693 694 __field( __u64, goal ) 694 695 __field( __u64, pleft ) 695 696 __field( __u64, pright ) 697 + __field( unsigned int, flags ) 696 698 ), 697 699 698 700 TP_fast_assign( 699 701 __entry->dev = ar->inode->i_sb->s_dev; 700 702 __entry->ino = ar->inode->i_ino; 701 703 __entry->block = block; 702 - __entry->flags = ar->flags; 703 704 __entry->len = ar->len; 704 705 __entry->logical = ar->logical; 705 706 __entry->goal = ar->goal; ··· 707 708 __entry->lright = ar->lright; 708 709 __entry->pleft = ar->pleft; 709 710 __entry->pright = ar->pright; 711 + __entry->flags = ar->flags; 710 712 ), 711 713 712 714 TP_printk("dev %d,%d ino 
%lu flags %u len %u block %llu lblk %u " ··· 728 728 TP_STRUCT__entry( 729 729 __field( dev_t, dev ) 730 730 __field( ino_t, ino ) 731 - __field( __u16, mode ) 732 731 __field( __u64, block ) 733 732 __field( unsigned long, count ) 734 733 __field( int, flags ) 734 + __field( __u16, mode ) 735 735 ), 736 736 737 737 TP_fast_assign( 738 738 __entry->dev = inode->i_sb->s_dev; 739 739 __entry->ino = inode->i_ino; 740 - __entry->mode = inode->i_mode; 741 740 __entry->block = block; 742 741 __entry->count = count; 743 742 __entry->flags = flags; 743 + __entry->mode = inode->i_mode; 744 744 ), 745 745 746 746 TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %d", ··· 783 783 TP_ARGS(inode, ret), 784 784 785 785 TP_STRUCT__entry( 786 - __field( int, ret ) 787 - __field( ino_t, ino ) 788 786 __field( dev_t, dev ) 787 + __field( ino_t, ino ) 788 + __field( int, ret ) 789 789 ), 790 790 791 791 TP_fast_assign( 792 - __entry->ret = ret; 793 - __entry->ino = inode->i_ino; 794 792 __entry->dev = inode->i_sb->s_dev; 793 + __entry->ino = inode->i_ino; 794 + __entry->ret = ret; 795 795 ), 796 796 797 797 TP_printk("dev %d,%d ino %lu ret %d", ··· 854 854 TP_STRUCT__entry( 855 855 __field( dev_t, dev ) 856 856 __field( ino_t, ino ) 857 - __field( __u16, found ) 858 - __field( __u16, groups ) 859 - __field( __u16, buddy ) 860 - __field( __u16, flags ) 861 - __field( __u16, tail ) 862 - __field( __u8, cr ) 863 857 __field( __u32, orig_logical ) 864 858 __field( int, orig_start ) 865 859 __field( __u32, orig_group ) ··· 866 872 __field( int, result_start ) 867 873 __field( __u32, result_group ) 868 874 __field( int, result_len ) 875 + __field( __u16, found ) 876 + __field( __u16, groups ) 877 + __field( __u16, buddy ) 878 + __field( __u16, flags ) 879 + __field( __u16, tail ) 880 + __field( __u8, cr ) 869 881 ), 870 882 871 883 TP_fast_assign( 872 884 __entry->dev = ac->ac_inode->i_sb->s_dev; 873 885 __entry->ino = ac->ac_inode->i_ino; 874 - __entry->found = 
ac->ac_found; 875 - __entry->flags = ac->ac_flags; 876 - __entry->groups = ac->ac_groups_scanned; 877 - __entry->buddy = ac->ac_buddy; 878 - __entry->tail = ac->ac_tail; 879 - __entry->cr = ac->ac_criteria; 880 886 __entry->orig_logical = ac->ac_o_ex.fe_logical; 881 887 __entry->orig_start = ac->ac_o_ex.fe_start; 882 888 __entry->orig_group = ac->ac_o_ex.fe_group; ··· 889 895 __entry->result_start = ac->ac_f_ex.fe_start; 890 896 __entry->result_group = ac->ac_f_ex.fe_group; 891 897 __entry->result_len = ac->ac_f_ex.fe_len; 898 + __entry->found = ac->ac_found; 899 + __entry->flags = ac->ac_flags; 900 + __entry->groups = ac->ac_groups_scanned; 901 + __entry->buddy = ac->ac_buddy; 902 + __entry->tail = ac->ac_tail; 903 + __entry->cr = ac->ac_criteria; 892 904 ), 893 905 894 906 TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u " ··· 1015 1015 TP_STRUCT__entry( 1016 1016 __field( dev_t, dev ) 1017 1017 __field( ino_t, ino ) 1018 - __field( __u16, mode ) 1019 - __field( int, is_metadata ) 1020 1018 __field( __u64, block ) 1019 + __field( int, is_metadata ) 1020 + __field( __u16, mode ) 1021 1021 ), 1022 1022 1023 1023 TP_fast_assign( 1024 1024 __entry->dev = inode->i_sb->s_dev; 1025 1025 __entry->ino = inode->i_ino; 1026 - __entry->mode = inode->i_mode; 1027 - __entry->is_metadata = is_metadata; 1028 1026 __entry->block = block; 1027 + __entry->is_metadata = is_metadata; 1028 + __entry->mode = inode->i_mode; 1029 1029 ), 1030 1030 1031 1031 TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu", ··· 1042 1042 TP_STRUCT__entry( 1043 1043 __field( dev_t, dev ) 1044 1044 __field( ino_t, ino ) 1045 - __field( __u16, mode ) 1046 1045 __field( __u64, i_blocks ) 1047 1046 __field( int, used_blocks ) 1048 1047 __field( int, reserved_data_blocks ) 1049 1048 __field( int, reserved_meta_blocks ) 1050 1049 __field( int, allocated_meta_blocks ) 1051 1050 __field( int, quota_claim ) 1051 + __field( __u16, mode ) 1052 1052 ), 1053 1053 1054 1054 
TP_fast_assign( 1055 1055 __entry->dev = inode->i_sb->s_dev; 1056 1056 __entry->ino = inode->i_ino; 1057 - __entry->mode = inode->i_mode; 1058 1057 __entry->i_blocks = inode->i_blocks; 1059 1058 __entry->used_blocks = used_blocks; 1060 1059 __entry->reserved_data_blocks = ··· 1063 1064 __entry->allocated_meta_blocks = 1064 1065 EXT4_I(inode)->i_allocated_meta_blocks; 1065 1066 __entry->quota_claim = quota_claim; 1067 + __entry->mode = inode->i_mode; 1066 1068 ), 1067 1069 1068 1070 TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d " ··· 1085 1085 TP_STRUCT__entry( 1086 1086 __field( dev_t, dev ) 1087 1087 __field( ino_t, ino ) 1088 - __field( __u16, mode ) 1089 1088 __field( __u64, i_blocks ) 1090 1089 __field( int, md_needed ) 1091 1090 __field( int, reserved_data_blocks ) 1092 1091 __field( int, reserved_meta_blocks ) 1092 + __field( __u16, mode ) 1093 1093 ), 1094 1094 1095 1095 TP_fast_assign( 1096 1096 __entry->dev = inode->i_sb->s_dev; 1097 1097 __entry->ino = inode->i_ino; 1098 - __entry->mode = inode->i_mode; 1099 1098 __entry->i_blocks = inode->i_blocks; 1100 1099 __entry->md_needed = md_needed; 1101 1100 __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; 1102 1101 __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; 1102 + __entry->mode = inode->i_mode; 1103 1103 ), 1104 1104 1105 1105 TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu md_needed %d " ··· 1119 1119 TP_STRUCT__entry( 1120 1120 __field( dev_t, dev ) 1121 1121 __field( ino_t, ino ) 1122 - __field( __u16, mode ) 1123 1122 __field( __u64, i_blocks ) 1124 1123 __field( int, freed_blocks ) 1125 1124 __field( int, reserved_data_blocks ) 1126 1125 __field( int, reserved_meta_blocks ) 1127 1126 __field( int, allocated_meta_blocks ) 1127 + __field( __u16, mode ) 1128 1128 ), 1129 1129 1130 1130 TP_fast_assign( 1131 1131 __entry->dev = inode->i_sb->s_dev; 1132 1132 __entry->ino = inode->i_ino; 1133 - __entry->mode = inode->i_mode; 1134 1133 
__entry->i_blocks = inode->i_blocks; 1135 1134 __entry->freed_blocks = freed_blocks; 1136 1135 __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; 1137 1136 __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; 1138 1137 __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; 1138 + __entry->mode = inode->i_mode; 1139 1139 ), 1140 1140 1141 1141 TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d " ··· 1203 1203 TP_ARGS(inode, offset, len, rw), 1204 1204 1205 1205 TP_STRUCT__entry( 1206 - __field( ino_t, ino ) 1207 1206 __field( dev_t, dev ) 1207 + __field( ino_t, ino ) 1208 1208 __field( loff_t, pos ) 1209 1209 __field( unsigned long, len ) 1210 1210 __field( int, rw ) 1211 1211 ), 1212 1212 1213 1213 TP_fast_assign( 1214 - __entry->ino = inode->i_ino; 1215 1214 __entry->dev = inode->i_sb->s_dev; 1215 + __entry->ino = inode->i_ino; 1216 1216 __entry->pos = offset; 1217 1217 __entry->len = len; 1218 1218 __entry->rw = rw; ··· 1231 1231 TP_ARGS(inode, offset, len, rw, ret), 1232 1232 1233 1233 TP_STRUCT__entry( 1234 - __field( ino_t, ino ) 1235 1234 __field( dev_t, dev ) 1235 + __field( ino_t, ino ) 1236 1236 __field( loff_t, pos ) 1237 1237 __field( unsigned long, len ) 1238 1238 __field( int, rw ) ··· 1240 1240 ), 1241 1241 1242 1242 TP_fast_assign( 1243 - __entry->ino = inode->i_ino; 1244 1243 __entry->dev = inode->i_sb->s_dev; 1244 + __entry->ino = inode->i_ino; 1245 1245 __entry->pos = offset; 1246 1246 __entry->len = len; 1247 1247 __entry->rw = rw; ··· 1261 1261 TP_ARGS(inode, offset, len, mode), 1262 1262 1263 1263 TP_STRUCT__entry( 1264 - __field( ino_t, ino ) 1265 1264 __field( dev_t, dev ) 1265 + __field( ino_t, ino ) 1266 1266 __field( loff_t, pos ) 1267 1267 __field( loff_t, len ) 1268 1268 __field( int, mode ) 1269 1269 ), 1270 1270 1271 1271 TP_fast_assign( 1272 - __entry->ino = inode->i_ino; 1273 1272 __entry->dev = inode->i_sb->s_dev; 1273 + __entry->ino = inode->i_ino; 1274 1274 
__entry->pos = offset; 1275 1275 __entry->len = len; 1276 1276 __entry->mode = mode; ··· 1289 1289 TP_ARGS(inode, offset, max_blocks, ret), 1290 1290 1291 1291 TP_STRUCT__entry( 1292 - __field( ino_t, ino ) 1293 1292 __field( dev_t, dev ) 1293 + __field( ino_t, ino ) 1294 1294 __field( loff_t, pos ) 1295 1295 __field( unsigned int, blocks ) 1296 1296 __field( int, ret ) 1297 1297 ), 1298 1298 1299 1299 TP_fast_assign( 1300 - __entry->ino = inode->i_ino; 1301 1300 __entry->dev = inode->i_sb->s_dev; 1301 + __entry->ino = inode->i_ino; 1302 1302 __entry->pos = offset; 1303 1303 __entry->blocks = max_blocks; 1304 1304 __entry->ret = ret; ··· 1317 1317 TP_ARGS(parent, dentry), 1318 1318 1319 1319 TP_STRUCT__entry( 1320 - __field( ino_t, parent ) 1321 - __field( ino_t, ino ) 1322 - __field( loff_t, size ) 1323 1320 __field( dev_t, dev ) 1321 + __field( ino_t, ino ) 1322 + __field( ino_t, parent ) 1323 + __field( loff_t, size ) 1324 1324 ), 1325 1325 1326 1326 TP_fast_assign( 1327 - __entry->parent = parent->i_ino; 1328 - __entry->ino = dentry->d_inode->i_ino; 1329 - __entry->size = dentry->d_inode->i_size; 1330 1327 __entry->dev = dentry->d_inode->i_sb->s_dev; 1328 + __entry->ino = dentry->d_inode->i_ino; 1329 + __entry->parent = parent->i_ino; 1330 + __entry->size = dentry->d_inode->i_size; 1331 1331 ), 1332 1332 1333 1333 TP_printk("dev %d,%d ino %lu size %lld parent %lu", ··· 1342 1342 TP_ARGS(dentry, ret), 1343 1343 1344 1344 TP_STRUCT__entry( 1345 - __field( ino_t, ino ) 1346 1345 __field( dev_t, dev ) 1346 + __field( ino_t, ino ) 1347 1347 __field( int, ret ) 1348 1348 ), 1349 1349 1350 1350 TP_fast_assign( 1351 - __entry->ino = dentry->d_inode->i_ino; 1352 1351 __entry->dev = dentry->d_inode->i_sb->s_dev; 1352 + __entry->ino = dentry->d_inode->i_ino; 1353 1353 __entry->ret = ret; 1354 1354 ), 1355 1355 ··· 1365 1365 TP_ARGS(inode), 1366 1366 1367 1367 TP_STRUCT__entry( 1368 - __field( ino_t, ino ) 1369 - __field( dev_t, dev ) 1368 + __field( dev_t, dev ) 1369 + 
__field( ino_t, ino ) 1370 1370 __field( __u64, blocks ) 1371 1371 ), 1372 1372 1373 1373 TP_fast_assign( 1374 - __entry->ino = inode->i_ino; 1375 1374 __entry->dev = inode->i_sb->s_dev; 1375 + __entry->ino = inode->i_ino; 1376 1376 __entry->blocks = inode->i_blocks; 1377 1377 ), 1378 1378 ··· 1403 1403 TP_ARGS(inode, map, ux), 1404 1404 1405 1405 TP_STRUCT__entry( 1406 - __field( ino_t, ino ) 1407 1406 __field( dev_t, dev ) 1407 + __field( ino_t, ino ) 1408 1408 __field( ext4_lblk_t, m_lblk ) 1409 1409 __field( unsigned, m_len ) 1410 1410 __field( ext4_lblk_t, u_lblk ) ··· 1413 1413 ), 1414 1414 1415 1415 TP_fast_assign( 1416 - __entry->ino = inode->i_ino; 1417 1416 __entry->dev = inode->i_sb->s_dev; 1417 + __entry->ino = inode->i_ino; 1418 1418 __entry->m_lblk = map->m_lblk; 1419 1419 __entry->m_len = map->m_len; 1420 1420 __entry->u_lblk = le32_to_cpu(ux->ee_block); ··· 1441 1441 TP_ARGS(inode, map, ux, ix), 1442 1442 1443 1443 TP_STRUCT__entry( 1444 - __field( ino_t, ino ) 1445 1444 __field( dev_t, dev ) 1445 + __field( ino_t, ino ) 1446 1446 __field( ext4_lblk_t, m_lblk ) 1447 1447 __field( unsigned, m_len ) 1448 1448 __field( ext4_lblk_t, u_lblk ) ··· 1454 1454 ), 1455 1455 1456 1456 TP_fast_assign( 1457 - __entry->ino = inode->i_ino; 1458 1457 __entry->dev = inode->i_sb->s_dev; 1458 + __entry->ino = inode->i_ino; 1459 1459 __entry->m_lblk = map->m_lblk; 1460 1460 __entry->m_len = map->m_len; 1461 1461 __entry->u_lblk = le32_to_cpu(ux->ee_block); ··· 1483 1483 TP_ARGS(inode, lblk, len, flags), 1484 1484 1485 1485 TP_STRUCT__entry( 1486 - __field( ino_t, ino ) 1487 - __field( dev_t, dev ) 1486 + __field( dev_t, dev ) 1487 + __field( ino_t, ino ) 1488 1488 __field( ext4_lblk_t, lblk ) 1489 1489 __field( unsigned int, len ) 1490 1490 __field( unsigned int, flags ) 1491 1491 ), 1492 1492 1493 1493 TP_fast_assign( 1494 - __entry->ino = inode->i_ino; 1495 1494 __entry->dev = inode->i_sb->s_dev; 1495 + __entry->ino = inode->i_ino; 1496 1496 __entry->lblk = lblk; 
1497 1497 __entry->len = len; 1498 1498 __entry->flags = flags; ··· 1525 1525 TP_ARGS(inode, lblk, pblk, len, ret), 1526 1526 1527 1527 TP_STRUCT__entry( 1528 - __field( ino_t, ino ) 1529 1528 __field( dev_t, dev ) 1530 - __field( ext4_lblk_t, lblk ) 1529 + __field( ino_t, ino ) 1531 1530 __field( ext4_fsblk_t, pblk ) 1531 + __field( ext4_lblk_t, lblk ) 1532 1532 __field( unsigned int, len ) 1533 1533 __field( int, ret ) 1534 1534 ), 1535 1535 1536 1536 TP_fast_assign( 1537 - __entry->ino = inode->i_ino; 1538 1537 __entry->dev = inode->i_sb->s_dev; 1539 - __entry->lblk = lblk; 1538 + __entry->ino = inode->i_ino; 1540 1539 __entry->pblk = pblk; 1540 + __entry->lblk = lblk; 1541 1541 __entry->len = len; 1542 1542 __entry->ret = ret; 1543 1543 ), ··· 1569 1569 TP_ARGS(inode, lblk, pblk), 1570 1570 1571 1571 TP_STRUCT__entry( 1572 - __field( ino_t, ino ) 1573 1572 __field( dev_t, dev ) 1574 - __field( ext4_lblk_t, lblk ) 1573 + __field( ino_t, ino ) 1575 1574 __field( ext4_fsblk_t, pblk ) 1575 + __field( ext4_lblk_t, lblk ) 1576 1576 ), 1577 1577 1578 1578 TP_fast_assign( 1579 - __entry->ino = inode->i_ino; 1580 1579 __entry->dev = inode->i_sb->s_dev; 1581 - __entry->lblk = lblk; 1580 + __entry->ino = inode->i_ino; 1582 1581 __entry->pblk = pblk; 1582 + __entry->lblk = lblk; 1583 1583 ), 1584 1584 1585 1585 TP_printk("dev %d,%d ino %lu lblk %u pblk %llu", ··· 1594 1594 TP_ARGS(inode), 1595 1595 1596 1596 TP_STRUCT__entry( 1597 - __field( ino_t, ino ) 1598 1597 __field( dev_t, dev ) 1598 + __field( ino_t, ino ) 1599 1599 ), 1600 1600 1601 1601 TP_fast_assign( 1602 - __entry->ino = inode->i_ino; 1603 1602 __entry->dev = inode->i_sb->s_dev; 1603 + __entry->ino = inode->i_ino; 1604 1604 ), 1605 1605 1606 1606 TP_printk("dev %d,%d ino %ld", ··· 1615 1615 1616 1616 TP_STRUCT__entry( 1617 1617 __field( dev_t, dev ) 1618 - __field( int, nblocks ) 1619 1618 __field(unsigned long, ip ) 1619 + __field( int, nblocks ) 1620 1620 ), 1621 1621 1622 1622 TP_fast_assign( 1623 1623 
__entry->dev = sb->s_dev; 1624 - __entry->nblocks = nblocks; 1625 1624 __entry->ip = IP; 1625 + __entry->nblocks = nblocks; 1626 1626 ), 1627 1627 1628 1628 TP_printk("dev %d,%d nblocks %d caller %pF", ··· 1686 1686 TP_ARGS(inode, map, allocated, newblock), 1687 1687 1688 1688 TP_STRUCT__entry( 1689 - __field( ino_t, ino ) 1690 1689 __field( dev_t, dev ) 1690 + __field( ino_t, ino ) 1691 + __field( int, flags ) 1691 1692 __field( ext4_lblk_t, lblk ) 1692 1693 __field( ext4_fsblk_t, pblk ) 1693 1694 __field( unsigned int, len ) 1694 - __field( int, flags ) 1695 1695 __field( unsigned int, allocated ) 1696 1696 __field( ext4_fsblk_t, newblk ) 1697 1697 ), 1698 1698 1699 1699 TP_fast_assign( 1700 - __entry->ino = inode->i_ino; 1701 1700 __entry->dev = inode->i_sb->s_dev; 1701 + __entry->ino = inode->i_ino; 1702 + __entry->flags = map->m_flags; 1702 1703 __entry->lblk = map->m_lblk; 1703 1704 __entry->pblk = map->m_pblk; 1704 1705 __entry->len = map->m_len; 1705 - __entry->flags = map->m_flags; 1706 1706 __entry->allocated = allocated; 1707 1707 __entry->newblk = newblock; 1708 1708 ), ··· 1724 1724 1725 1725 TP_STRUCT__entry( 1726 1726 __field( dev_t, dev ) 1727 + __field( unsigned int, flags ) 1727 1728 __field( ext4_lblk_t, lblk ) 1728 1729 __field( ext4_fsblk_t, pblk ) 1729 1730 __field( unsigned int, len ) 1730 - __field( unsigned int, flags ) 1731 1731 __field( int, ret ) 1732 1732 ), 1733 1733 1734 1734 TP_fast_assign( 1735 1735 __entry->dev = sb->s_dev; 1736 + __entry->flags = map->m_flags; 1736 1737 __entry->lblk = map->m_lblk; 1737 1738 __entry->pblk = map->m_pblk; 1738 1739 __entry->len = map->m_len; 1739 - __entry->flags = map->m_flags; 1740 1740 __entry->ret = ret; 1741 1741 ), 1742 1742 ··· 1753 1753 TP_ARGS(inode, lblk, len, start), 1754 1754 1755 1755 TP_STRUCT__entry( 1756 - __field( ino_t, ino ) 1757 1756 __field( dev_t, dev ) 1757 + __field( ino_t, ino ) 1758 1758 __field( ext4_lblk_t, lblk ) 1759 1759 __field( unsigned int, len ) 1760 1760 __field( 
ext4_fsblk_t, start ) 1761 1761 ), 1762 1762 1763 1763 TP_fast_assign( 1764 - __entry->ino = inode->i_ino; 1765 1764 __entry->dev = inode->i_sb->s_dev; 1765 + __entry->ino = inode->i_ino; 1766 1766 __entry->lblk = lblk; 1767 1767 __entry->len = len; 1768 1768 __entry->start = start; ··· 1782 1782 TP_ARGS(inode, lblk, ret), 1783 1783 1784 1784 TP_STRUCT__entry( 1785 - __field( ino_t, ino ) 1786 1785 __field( dev_t, dev ) 1786 + __field( ino_t, ino ) 1787 1787 __field( ext4_lblk_t, lblk ) 1788 1788 __field( int, ret ) 1789 1789 ), 1790 1790 1791 1791 TP_fast_assign( 1792 - __entry->ino = inode->i_ino; 1793 1792 __entry->dev = inode->i_sb->s_dev; 1793 + __entry->ino = inode->i_ino; 1794 1794 __entry->lblk = lblk; 1795 1795 __entry->ret = ret; 1796 1796 ), ··· 1810 1810 TP_ARGS(inode, from, to, reverse, found, found_blk), 1811 1811 1812 1812 TP_STRUCT__entry( 1813 - __field( ino_t, ino ) 1814 1813 __field( dev_t, dev ) 1814 + __field( ino_t, ino ) 1815 1815 __field( ext4_lblk_t, from ) 1816 1816 __field( ext4_lblk_t, to ) 1817 1817 __field( int, reverse ) ··· 1820 1820 ), 1821 1821 1822 1822 TP_fast_assign( 1823 - __entry->ino = inode->i_ino; 1824 1823 __entry->dev = inode->i_sb->s_dev; 1824 + __entry->ino = inode->i_ino; 1825 1825 __entry->from = from; 1826 1826 __entry->to = to; 1827 1827 __entry->reverse = reverse; ··· 1844 1844 TP_ARGS(inode, lblk, len), 1845 1845 1846 1846 TP_STRUCT__entry( 1847 - __field( ino_t, ino ) 1848 1847 __field( dev_t, dev ) 1848 + __field( ino_t, ino ) 1849 1849 __field( ext4_lblk_t, lblk ) 1850 1850 __field( unsigned int, len ) 1851 1851 ), 1852 1852 1853 1853 TP_fast_assign( 1854 - __entry->ino = inode->i_ino; 1855 1854 __entry->dev = inode->i_sb->s_dev; 1855 + __entry->ino = inode->i_ino; 1856 1856 __entry->lblk = lblk; 1857 1857 __entry->len = len; 1858 1858 ), ··· 1871 1871 TP_ARGS(inode, lblk, pblk, len), 1872 1872 1873 1873 TP_STRUCT__entry( 1874 - __field( ino_t, ino ) 1875 1874 __field( dev_t, dev ) 1876 - __field( ext4_lblk_t, 
lblk ) 1875 + __field( ino_t, ino ) 1877 1876 __field( ext4_fsblk_t, pblk ) 1877 + __field( ext4_lblk_t, lblk ) 1878 1878 __field( unsigned short, len ) 1879 1879 ), 1880 1880 1881 1881 TP_fast_assign( 1882 - __entry->ino = inode->i_ino; 1883 1882 __entry->dev = inode->i_sb->s_dev; 1884 - __entry->lblk = lblk; 1883 + __entry->ino = inode->i_ino; 1885 1884 __entry->pblk = pblk; 1885 + __entry->lblk = lblk; 1886 1886 __entry->len = len; 1887 1887 ), 1888 1888 ··· 1902 1902 TP_ARGS(inode, ex, from, to, partial_cluster), 1903 1903 1904 1904 TP_STRUCT__entry( 1905 - __field( ino_t, ino ) 1906 1905 __field( dev_t, dev ) 1907 - __field( ext4_lblk_t, ee_lblk ) 1908 - __field( ext4_fsblk_t, ee_pblk ) 1909 - __field( unsigned short, ee_len ) 1906 + __field( ino_t, ino ) 1910 1907 __field( ext4_lblk_t, from ) 1911 1908 __field( ext4_lblk_t, to ) 1912 1909 __field( ext4_fsblk_t, partial ) 1910 + __field( ext4_fsblk_t, ee_pblk ) 1911 + __field( ext4_lblk_t, ee_lblk ) 1912 + __field( unsigned short, ee_len ) 1913 1913 ), 1914 1914 1915 1915 TP_fast_assign( 1916 - __entry->ino = inode->i_ino; 1917 1916 __entry->dev = inode->i_sb->s_dev; 1918 - __entry->ee_lblk = cpu_to_le32(ex->ee_block); 1919 - __entry->ee_pblk = ext4_ext_pblock(ex); 1920 - __entry->ee_len = ext4_ext_get_actual_len(ex); 1917 + __entry->ino = inode->i_ino; 1921 1918 __entry->from = from; 1922 1919 __entry->to = to; 1923 1920 __entry->partial = partial_cluster; 1921 + __entry->ee_pblk = ext4_ext_pblock(ex); 1922 + __entry->ee_lblk = cpu_to_le32(ex->ee_block); 1923 + __entry->ee_len = ext4_ext_get_actual_len(ex); 1924 1924 ), 1925 1925 1926 1926 TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]" ··· 1942 1942 TP_ARGS(inode, start, ex, partial_cluster), 1943 1943 1944 1944 TP_STRUCT__entry( 1945 - __field( ino_t, ino ) 1946 1945 __field( dev_t, dev ) 1946 + __field( ino_t, ino ) 1947 + __field( ext4_fsblk_t, partial ) 1947 1948 __field( ext4_lblk_t, start ) 1948 1949 __field( ext4_lblk_t, ee_lblk ) 1949 1950 
__field( ext4_fsblk_t, ee_pblk ) 1950 1951 __field( short, ee_len ) 1951 - __field( ext4_fsblk_t, partial ) 1952 1952 ), 1953 1953 1954 1954 TP_fast_assign( 1955 - __entry->ino = inode->i_ino; 1956 1955 __entry->dev = inode->i_sb->s_dev; 1956 + __entry->ino = inode->i_ino; 1957 + __entry->partial = partial_cluster; 1957 1958 __entry->start = start; 1958 1959 __entry->ee_lblk = le32_to_cpu(ex->ee_block); 1959 1960 __entry->ee_pblk = ext4_ext_pblock(ex); 1960 1961 __entry->ee_len = ext4_ext_get_actual_len(ex); 1961 - __entry->partial = partial_cluster; 1962 1962 ), 1963 1963 1964 1964 TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]" ··· 1978 1978 TP_ARGS(inode, pblk), 1979 1979 1980 1980 TP_STRUCT__entry( 1981 - __field( ino_t, ino ) 1982 1981 __field( dev_t, dev ) 1982 + __field( ino_t, ino ) 1983 1983 __field( ext4_fsblk_t, pblk ) 1984 1984 ), 1985 1985 1986 1986 TP_fast_assign( 1987 - __entry->ino = inode->i_ino; 1988 1987 __entry->dev = inode->i_sb->s_dev; 1988 + __entry->ino = inode->i_ino; 1989 1989 __entry->pblk = pblk; 1990 1990 ), 1991 1991 ··· 2001 2001 TP_ARGS(inode, start, depth), 2002 2002 2003 2003 TP_STRUCT__entry( 2004 - __field( ino_t, ino ) 2005 2004 __field( dev_t, dev ) 2005 + __field( ino_t, ino ) 2006 2006 __field( ext4_lblk_t, start ) 2007 2007 __field( int, depth ) 2008 2008 ), 2009 2009 2010 2010 TP_fast_assign( 2011 - __entry->ino = inode->i_ino; 2012 2011 __entry->dev = inode->i_sb->s_dev; 2012 + __entry->ino = inode->i_ino; 2013 2013 __entry->start = start; 2014 2014 __entry->depth = depth; 2015 2015 ), ··· 2028 2028 TP_ARGS(inode, start, depth, partial, eh_entries), 2029 2029 2030 2030 TP_STRUCT__entry( 2031 - __field( ino_t, ino ) 2032 2031 __field( dev_t, dev ) 2032 + __field( ino_t, ino ) 2033 2033 __field( ext4_lblk_t, start ) 2034 2034 __field( int, depth ) 2035 2035 __field( ext4_lblk_t, partial ) ··· 2037 2037 ), 2038 2038 2039 2039 TP_fast_assign( 2040 - __entry->ino = inode->i_ino; 2041 2040 __entry->dev = 
inode->i_sb->s_dev; 2041 + __entry->ino = inode->i_ino; 2042 2042 __entry->start = start; 2043 2043 __entry->depth = depth; 2044 2044 __entry->partial = partial;