Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

+29 -19

fs/buffer.c

··· 993 993 */ 994 994 static int 995 995 grow_dev_page(struct block_device *bdev, sector_t block, 996 - pgoff_t index, int size, int sizebits) 996 + pgoff_t index, int size, int sizebits, gfp_t gfp) 997 997 { 998 998 struct inode *inode = bdev->bd_inode; 999 999 struct page *page; ··· 1002 1002 int ret = 0; /* Will call free_more_memory() */ 1003 1003 gfp_t gfp_mask; 1004 1004 1005 - gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS; 1006 - gfp_mask |= __GFP_MOVABLE; 1005 + gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp; 1006 + 1007 1007 /* 1008 1008 * XXX: __getblk_slow() can not really deal with failure and 1009 1009 * will endlessly loop on improvised global reclaim. Prefer ··· 1060 1060 * that page was dirty, the buffers are set dirty also. 1061 1061 */ 1062 1062 static int 1063 - grow_buffers(struct block_device *bdev, sector_t block, int size) 1063 + grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp) 1064 1064 { 1065 1065 pgoff_t index; 1066 1066 int sizebits; ··· 1087 1087 } 1088 1088 1089 1089 /* Create a page with the proper size buffers.. */ 1090 - return grow_dev_page(bdev, block, index, size, sizebits); 1090 + return grow_dev_page(bdev, block, index, size, sizebits, gfp); 1091 1091 } 1092 1092 1093 - static struct buffer_head * 1094 - __getblk_slow(struct block_device *bdev, sector_t block, int size) 1093 + struct buffer_head * 1094 + __getblk_slow(struct block_device *bdev, sector_t block, 1095 + unsigned size, gfp_t gfp) 1095 1096 { 1096 1097 /* Size must be multiple of hard sectorsize */ 1097 1098 if (unlikely(size & (bdev_logical_block_size(bdev)-1) || ··· 1114 1113 if (bh) 1115 1114 return bh; 1116 1115 1117 - ret = grow_buffers(bdev, block, size); 1116 + ret = grow_buffers(bdev, block, size, gfp); 1118 1117 if (ret < 0) 1119 1118 return NULL; 1120 1119 if (ret == 0) 1121 1120 free_more_memory(); 1122 1121 } 1123 1122 } 1123 + EXPORT_SYMBOL(__getblk_slow); 1124 1124 1125 1125 /* 1126 1126 * The relationship between dirty buffers and dirty pages: ··· 1375 1373 EXPORT_SYMBOL(__find_get_block); 1376 1374 1377 1375 /* 1378 - * __getblk will locate (and, if necessary, create) the buffer_head 1376 + * __getblk_gfp() will locate (and, if necessary, create) the buffer_head 1379 1377 * which corresponds to the passed block_device, block and size. The 1380 1378 * returned buffer has its reference count incremented. 1381 1379 * 1382 - * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() 1383 - * attempt is failing. FIXME, perhaps? 1380 + * __getblk_gfp() will lock up the machine if grow_dev_page's 1381 + * try_to_free_buffers() attempt is failing. FIXME, perhaps? 1384 1382 */ 1385 1383 struct buffer_head * 1386 - __getblk(struct block_device *bdev, sector_t block, unsigned size) 1384 + __getblk_gfp(struct block_device *bdev, sector_t block, 1385 + unsigned size, gfp_t gfp) 1387 1386 { 1388 1387 struct buffer_head *bh = __find_get_block(bdev, block, size); 1389 1388 1390 1389 might_sleep(); 1391 1390 if (bh == NULL) 1392 - bh = __getblk_slow(bdev, block, size); 1391 + bh = __getblk_slow(bdev, block, size, gfp); 1393 1392 return bh; 1394 1393 } 1395 - EXPORT_SYMBOL(__getblk); 1394 + EXPORT_SYMBOL(__getblk_gfp); 1396 1395 1397 1396 /* 1398 1397 * Do async read-ahead on a buffer.. ··· 1409 1406 EXPORT_SYMBOL(__breadahead); 1410 1407 1411 1408 /** 1412 - * __bread() - reads a specified block and returns the bh 1409 + * __bread_gfp() - reads a specified block and returns the bh 1413 1410 * @bdev: the block_device to read from 1414 1411 * @block: number of block 1415 1412 * @size: size (in bytes) to read 1416 - * 1413 + * @gfp: page allocation flag 1414 + * 1417 1415 * Reads a specified block, and returns buffer head that contains it. 1416 + * The page cache can be allocated from non-movable area 1417 + * not to prevent page migration if you set gfp to zero. 1418 1418 * It returns NULL if the block was unreadable. 1419 1419 */ 1420 1420 struct buffer_head * 1421 - __bread(struct block_device *bdev, sector_t block, unsigned size) 1421 + __bread_gfp(struct block_device *bdev, sector_t block, 1422 + unsigned size, gfp_t gfp) 1422 1423 { 1423 - struct buffer_head *bh = __getblk(bdev, block, size); 1424 + struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp); 1424 1425 1425 1426 if (likely(bh) && !buffer_uptodate(bh)) 1426 1427 bh = __bread_slow(bh); 1427 1428 return bh; 1428 1429 } 1429 - EXPORT_SYMBOL(__bread); 1430 + EXPORT_SYMBOL(__bread_gfp); 1430 1431 1431 1432 /* 1432 1433 * invalidate_bh_lrus() is called rarely - but not only at unmount. ··· 2089 2082 struct page *page, void *fsdata) 2090 2083 { 2091 2084 struct inode *inode = mapping->host; 2085 + loff_t old_size = inode->i_size; 2092 2086 int i_size_changed = 0; 2093 2087 2094 2088 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); ··· 2109 2101 unlock_page(page); 2110 2102 page_cache_release(page); 2111 2103 2104 + if (old_size < pos) 2105 + pagecache_isize_extended(inode, old_size, pos); 2112 2106 /* 2113 2107 * Don't mark the inode dirty under page lock. First, it unnecessarily 2114 2108 * makes the holding time of page lock longer. Second, it forces lock

+9 -6

fs/ext4/balloc.c

··· 176 176 } 177 177 178 178 /* Initializes an uninitialized block bitmap */ 179 - static void ext4_init_block_bitmap(struct super_block *sb, 179 + static int ext4_init_block_bitmap(struct super_block *sb, 180 180 struct buffer_head *bh, 181 181 ext4_group_t block_group, 182 182 struct ext4_group_desc *gdp) ··· 192 192 /* If checksum is bad mark all blocks used to prevent allocation 193 193 * essentially implementing a per-group read-only flag. */ 194 194 if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { 195 - ext4_error(sb, "Checksum bad for group %u", block_group); 196 195 grp = ext4_get_group_info(sb, block_group); 197 196 if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) 198 197 percpu_counter_sub(&sbi->s_freeclusters_counter, ··· 204 205 count); 205 206 } 206 207 set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); 207 - return; 208 + return -EIO; 208 209 } 209 210 memset(bh->b_data, 0, sb->s_blocksize); 210 211 ··· 242 243 sb->s_blocksize * 8, bh->b_data); 243 244 ext4_block_bitmap_csum_set(sb, block_group, gdp, bh); 244 245 ext4_group_desc_csum_set(sb, block_group, gdp); 246 + return 0; 245 247 } 246 248 247 249 /* Return the number of free blocks in a block group. It is used when ··· 438 438 } 439 439 ext4_lock_group(sb, block_group); 440 440 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 441 - ext4_init_block_bitmap(sb, bh, block_group, desc); 441 + int err; 442 + 443 + err = ext4_init_block_bitmap(sb, bh, block_group, desc); 442 444 set_bitmap_uptodate(bh); 443 445 set_buffer_uptodate(bh); 444 446 ext4_unlock_group(sb, block_group); 445 447 unlock_buffer(bh); 448 + if (err) 449 + ext4_error(sb, "Checksum bad for grp %u", block_group); 446 450 return bh; 447 451 } 448 452 ext4_unlock_group(sb, block_group); ··· 640 636 * Account for the allocated meta blocks. We will never 641 637 * fail EDQUOT for metdata, but we do account for it. 642 638 */ 643 - if (!(*errp) && 644 - ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) { 639 + if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) { 645 640 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 646 641 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 647 642 dquot_alloc_block_nofail(inode,

+4 -8

fs/ext4/bitmap.c

··· 24 24 __u32 provided, calculated; 25 25 struct ext4_sb_info *sbi = EXT4_SB(sb); 26 26 27 - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 28 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 27 + if (!ext4_has_metadata_csum(sb)) 29 28 return 1; 30 29 31 30 provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); ··· 45 46 __u32 csum; 46 47 struct ext4_sb_info *sbi = EXT4_SB(sb); 47 48 48 - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 49 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 49 + if (!ext4_has_metadata_csum(sb)) 50 50 return; 51 51 52 52 csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); ··· 63 65 struct ext4_sb_info *sbi = EXT4_SB(sb); 64 66 int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; 65 67 66 - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 67 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 68 + if (!ext4_has_metadata_csum(sb)) 68 69 return 1; 69 70 70 71 provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); ··· 88 91 __u32 csum; 89 92 struct ext4_sb_info *sbi = EXT4_SB(sb); 90 93 91 - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 92 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 94 + if (!ext4_has_metadata_csum(sb)) 93 95 return; 94 96 95 97 csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);

+3 -5

fs/ext4/dir.c

··· 151 151 &file->f_ra, file, 152 152 index, 1); 153 153 file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; 154 - bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err); 154 + bh = ext4_bread(NULL, inode, map.m_lblk, 0); 155 + if (IS_ERR(bh)) 156 + return PTR_ERR(bh); 155 157 } 156 158 157 - /* 158 - * We ignore I/O errors on directories so users have a chance 159 - * of recovering data when there's a bad sector 160 - */ 161 159 if (!bh) { 162 160 if (!dir_has_error) { 163 161 EXT4_ERROR_FILE(file, 0,

+31 -19

fs/ext4/ext4.h

··· 572 572 573 573 /* 574 574 * The bit position of these flags must not overlap with any of the 575 - * EXT4_GET_BLOCKS_*. They are used by ext4_ext_find_extent(), 575 + * EXT4_GET_BLOCKS_*. They are used by ext4_find_extent(), 576 576 * read_extent_tree_block(), ext4_split_extent_at(), 577 577 * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). 578 578 * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be 579 579 * caching the extents when reading from the extent tree while a 580 580 * truncate or punch hole operation is in progress. 581 581 */ 582 - #define EXT4_EX_NOCACHE 0x0400 583 - #define EXT4_EX_FORCE_CACHE 0x0800 582 + #define EXT4_EX_NOCACHE 0x40000000 583 + #define EXT4_EX_FORCE_CACHE 0x20000000 584 584 585 585 /* 586 586 * Flags used by ext4_free_blocks ··· 890 890 struct ext4_es_tree i_es_tree; 891 891 rwlock_t i_es_lock; 892 892 struct list_head i_es_lru; 893 + unsigned int i_es_all_nr; /* protected by i_es_lock */ 893 894 unsigned int i_es_lru_nr; /* protected by i_es_lock */ 894 895 unsigned long i_touch_when; /* jiffies of last accessing */ 895 896 ··· 1175 1174 #define EXT4_MF_MNTDIR_SAMPLED 0x0001 1176 1175 #define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ 1177 1176 1177 + /* Number of quota types we support */ 1178 + #define EXT4_MAXQUOTAS 2 1179 + 1178 1180 /* 1179 1181 * fourth extended-fs super-block data in memory 1180 1182 */ ··· 1241 1237 u32 s_min_batch_time; 1242 1238 struct block_device *journal_bdev; 1243 1239 #ifdef CONFIG_QUOTA 1244 - char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ 1240 + char *s_qf_names[EXT4_MAXQUOTAS]; /* Names of quota files with journalled quota */ 1245 1241 int s_jquota_fmt; /* Format of quota to use */ 1246 1242 #endif 1247 1243 unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ ··· 1334 1330 /* Reclaim extents from extent status tree */ 1335 1331 struct shrinker s_es_shrinker; 1336 1332 struct list_head s_es_lru; 1337 - unsigned long s_es_last_sorted; 1338 - struct percpu_counter s_extent_cache_cnt; 1333 + struct ext4_es_stats s_es_stats; 1339 1334 struct mb_cache *s_mb_cache; 1340 1335 spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; 1341 1336 ··· 1402 1399 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ 1403 1400 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ 1404 1401 EXT4_STATE_NEWENTRY, /* File just added to dir */ 1405 - EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ 1406 1402 EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read 1407 1403 nolocking */ 1408 1404 EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ ··· 2088 2086 extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); 2089 2087 2090 2088 /* inode.c */ 2091 - struct buffer_head *ext4_getblk(handle_t *, struct inode *, 2092 - ext4_lblk_t, int, int *); 2093 - struct buffer_head *ext4_bread(handle_t *, struct inode *, 2094 - ext4_lblk_t, int, int *); 2089 + struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); 2090 + struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); 2095 2091 int ext4_get_block_write(struct inode *inode, sector_t iblock, 2096 2092 struct buffer_head *bh_result, int create); 2097 2093 int ext4_get_block(struct inode *inode, sector_t iblock, ··· 2109 2109 #define CONVERT_INLINE_DATA 2 2110 2110 2111 2111 extern struct inode *ext4_iget(struct super_block *, unsigned long); 2112 + extern struct inode *ext4_iget_normal(struct super_block *, unsigned long); 2112 2113 extern int ext4_write_inode(struct inode *, struct writeback_control *); 2113 2114 extern int ext4_setattr(struct dentry *, struct iattr *); 2114 2115 extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, ··· 2333 2332 static inline int ext4_has_group_desc_csum(struct super_block *sb) 2334 2333 { 2335 2334 return EXT4_HAS_RO_COMPAT_FEATURE(sb, 2336 - EXT4_FEATURE_RO_COMPAT_GDT_CSUM | 2337 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM); 2335 + EXT4_FEATURE_RO_COMPAT_GDT_CSUM) || 2336 + (EXT4_SB(sb)->s_chksum_driver != NULL); 2338 2337 } 2339 2338 2339 + static inline int ext4_has_metadata_csum(struct super_block *sb) 2340 + { 2341 + WARN_ON_ONCE(EXT4_HAS_RO_COMPAT_FEATURE(sb, 2342 + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && 2343 + !EXT4_SB(sb)->s_chksum_driver); 2344 + 2345 + return (EXT4_SB(sb)->s_chksum_driver != NULL); 2346 + } 2340 2347 static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) 2341 2348 { 2342 2349 return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | ··· 2740 2731 struct ext4_extent *ex1, 2741 2732 struct ext4_extent *ex2); 2742 2733 extern int ext4_ext_insert_extent(handle_t *, struct inode *, 2743 - struct ext4_ext_path *, 2734 + struct ext4_ext_path **, 2744 2735 struct ext4_extent *, int); 2745 - extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, 2746 - struct ext4_ext_path *, 2747 - int flags); 2736 + extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, 2737 + struct ext4_ext_path **, 2738 + int flags); 2748 2739 extern void ext4_ext_drop_refs(struct ext4_ext_path *); 2749 2740 extern int ext4_ext_check_inode(struct inode *inode); 2750 2741 extern int ext4_find_delalloc_range(struct inode *inode, 2751 2742 ext4_lblk_t lblk_start, 2752 2743 ext4_lblk_t lblk_end); 2753 2744 extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); 2745 + extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); 2754 2746 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2755 2747 __u64 start, __u64 len); 2756 2748 extern int ext4_ext_precache(struct inode *inode); 2757 2749 extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); 2750 + extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, 2751 + struct inode *inode2, ext4_lblk_t lblk1, 2752 + ext4_lblk_t lblk2, ext4_lblk_t count, 2753 + int mark_unwritten,int *err); 2758 2754 2759 2755 /* move_extent.c */ 2760 2756 extern void ext4_double_down_write_data_sem(struct inode *first, ··· 2769 2755 extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, 2770 2756 __u64 start_orig, __u64 start_donor, 2771 2757 __u64 len, __u64 *moved_len); 2772 - extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, 2773 - struct ext4_extent **extent); 2774 2758 2775 2759 /* page-io.c */ 2776 2760 extern int __init ext4_init_pageio(void);

+1

fs/ext4/ext4_extents.h

··· 123 123 struct ext4_ext_path { 124 124 ext4_fsblk_t p_block; 125 125 __u16 p_depth; 126 + __u16 p_maxdepth; 126 127 struct ext4_extent *p_ext; 127 128 struct ext4_extent_idx *p_idx; 128 129 struct ext4_extent_header *p_hdr;

+2 -2

fs/ext4/ext4_jbd2.c

··· 256 256 set_buffer_prio(bh); 257 257 if (ext4_handle_valid(handle)) { 258 258 err = jbd2_journal_dirty_metadata(handle, bh); 259 - /* Errors can only happen if there is a bug */ 260 - if (WARN_ON_ONCE(err)) { 259 + /* Errors can only happen due to aborted journal or a nasty bug */ 260 + if (!is_handle_aborted(handle) && WARN_ON_ONCE(err)) { 261 261 ext4_journal_abort_handle(where, line, __func__, bh, 262 262 handle, err); 263 263 if (inode == NULL) {

+3 -3

fs/ext4/ext4_jbd2.h

··· 102 102 #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 103 103 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0 104 104 #endif 105 - #define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) 106 - #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) 107 - #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) 105 + #define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) 106 + #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) 107 + #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) 108 108 109 109 static inline int ext4_jbd2_credits_xattr(struct inode *inode) 110 110 {

+404 -217

fs/ext4/extents.c

··· 73 73 { 74 74 struct ext4_extent_tail *et; 75 75 76 - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 77 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 76 + if (!ext4_has_metadata_csum(inode->i_sb)) 78 77 return 1; 79 78 80 79 et = find_ext4_extent_tail(eh); ··· 87 88 { 88 89 struct ext4_extent_tail *et; 89 90 90 - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 91 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 91 + if (!ext4_has_metadata_csum(inode->i_sb)) 92 92 return; 93 93 94 94 et = find_ext4_extent_tail(eh); ··· 96 98 97 99 static int ext4_split_extent(handle_t *handle, 98 100 struct inode *inode, 99 - struct ext4_ext_path *path, 101 + struct ext4_ext_path **ppath, 100 102 struct ext4_map_blocks *map, 101 103 int split_flag, 102 104 int flags); 103 105 104 106 static int ext4_split_extent_at(handle_t *handle, 105 107 struct inode *inode, 106 - struct ext4_ext_path *path, 108 + struct ext4_ext_path **ppath, 107 109 ext4_lblk_t split, 108 110 int split_flag, 109 111 int flags); ··· 287 289 size = 4; 288 290 #endif 289 291 return size; 292 + } 293 + 294 + static inline int 295 + ext4_force_split_extent_at(handle_t *handle, struct inode *inode, 296 + struct ext4_ext_path **ppath, ext4_lblk_t lblk, 297 + int nofail) 298 + { 299 + struct ext4_ext_path *path = *ppath; 300 + int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); 301 + 302 + return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ? 303 + EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0, 304 + EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | 305 + (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0)); 290 306 } 291 307 292 308 /* ··· 707 695 708 696 void ext4_ext_drop_refs(struct ext4_ext_path *path) 709 697 { 710 - int depth = path->p_depth; 711 - int i; 698 + int depth, i; 712 699 700 + if (!path) 701 + return; 702 + depth = path->p_depth; 713 703 for (i = 0; i <= depth; i++, path++) 714 704 if (path->p_bh) { 715 705 brelse(path->p_bh); ··· 855 841 } 856 842 857 843 struct ext4_ext_path * 858 - ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, 859 - struct ext4_ext_path *path, int flags) 844 + ext4_find_extent(struct inode *inode, ext4_lblk_t block, 845 + struct ext4_ext_path **orig_path, int flags) 860 846 { 861 847 struct ext4_extent_header *eh; 862 848 struct buffer_head *bh; 863 - short int depth, i, ppos = 0, alloc = 0; 849 + struct ext4_ext_path *path = orig_path ? *orig_path : NULL; 850 + short int depth, i, ppos = 0; 864 851 int ret; 865 852 866 853 eh = ext_inode_hdr(inode); 867 854 depth = ext_depth(inode); 868 855 869 - /* account possible depth increase */ 856 + if (path) { 857 + ext4_ext_drop_refs(path); 858 + if (depth > path[0].p_maxdepth) { 859 + kfree(path); 860 + *orig_path = path = NULL; 861 + } 862 + } 870 863 if (!path) { 864 + /* account possible depth increase */ 871 865 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2), 872 866 GFP_NOFS); 873 - if (!path) 867 + if (unlikely(!path)) 874 868 return ERR_PTR(-ENOMEM); 875 - alloc = 1; 869 + path[0].p_maxdepth = depth + 1; 876 870 } 877 871 path[0].p_hdr = eh; 878 872 path[0].p_bh = NULL; ··· 898 876 899 877 bh = read_extent_tree_block(inode, path[ppos].p_block, --i, 900 878 flags); 901 - if (IS_ERR(bh)) { 879 + if (unlikely(IS_ERR(bh))) { 902 880 ret = PTR_ERR(bh); 903 881 goto err; 904 882 } ··· 932 910 933 911 err: 934 912 ext4_ext_drop_refs(path); 935 - if (alloc) 936 - kfree(path); 913 + kfree(path); 914 + if (orig_path) 915 + *orig_path = NULL; 937 916 return ERR_PTR(ret); 938 917 } 939 918 ··· 1261 1238 * just created block 1262 1239 */ 1263 1240 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, 1264 - unsigned int flags, 1265 - struct ext4_extent *newext) 1241 + unsigned int flags) 1266 1242 { 1267 1243 struct ext4_extent_header *neh; 1268 1244 struct buffer_head *bh; 1269 - ext4_fsblk_t newblock; 1245 + ext4_fsblk_t newblock, goal = 0; 1246 + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; 1270 1247 int err = 0; 1271 1248 1272 - newblock = ext4_ext_new_meta_block(handle, inode, NULL, 1273 - newext, &err, flags); 1249 + /* Try to prepend new index to old one */ 1250 + if (ext_depth(inode)) 1251 + goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode))); 1252 + if (goal > le32_to_cpu(es->s_first_data_block)) { 1253 + flags |= EXT4_MB_HINT_TRY_GOAL; 1254 + goal--; 1255 + } else 1256 + goal = ext4_inode_to_goal_block(inode); 1257 + newblock = ext4_new_meta_blocks(handle, inode, goal, flags, 1258 + NULL, &err); 1274 1259 if (newblock == 0) 1275 1260 return err; 1276 1261 ··· 1345 1314 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, 1346 1315 unsigned int mb_flags, 1347 1316 unsigned int gb_flags, 1348 - struct ext4_ext_path *path, 1317 + struct ext4_ext_path **ppath, 1349 1318 struct ext4_extent *newext) 1350 1319 { 1320 + struct ext4_ext_path *path = *ppath; 1351 1321 struct ext4_ext_path *curp; 1352 1322 int depth, i, err = 0; 1353 1323 ··· 1372 1340 goto out; 1373 1341 1374 1342 /* refill path */ 1375 - ext4_ext_drop_refs(path); 1376 - path = ext4_ext_find_extent(inode, 1343 + path = ext4_find_extent(inode, 1377 1344 (ext4_lblk_t)le32_to_cpu(newext->ee_block), 1378 - path, gb_flags); 1345 + ppath, gb_flags); 1379 1346 if (IS_ERR(path)) 1380 1347 err = PTR_ERR(path); 1381 1348 } else { 1382 1349 /* tree is full, time to grow in depth */ 1383 - err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext); 1350 + err = ext4_ext_grow_indepth(handle, inode, mb_flags); 1384 1351 if (err) 1385 1352 goto out; 1386 1353 1387 1354 /* refill path */ 1388 - ext4_ext_drop_refs(path); 1389 - path = ext4_ext_find_extent(inode, 1355 + path = ext4_find_extent(inode, 1390 1356 (ext4_lblk_t)le32_to_cpu(newext->ee_block), 1391 - path, gb_flags); 1357 + ppath, gb_flags); 1392 1358 if (IS_ERR(path)) { 1393 1359 err = PTR_ERR(path); 1394 1360 goto out; ··· 1589 1559 * allocated block. Thus, index entries have to be consistent 1590 1560 * with leaves. 1591 1561 */ 1592 - static ext4_lblk_t 1562 + ext4_lblk_t 1593 1563 ext4_ext_next_allocated_block(struct ext4_ext_path *path) 1594 1564 { 1595 1565 int depth; ··· 1832 1802 sizeof(struct ext4_extent_idx); 1833 1803 s += sizeof(struct ext4_extent_header); 1834 1804 1805 + path[1].p_maxdepth = path[0].p_maxdepth; 1835 1806 memcpy(path[0].p_hdr, path[1].p_hdr, s); 1836 1807 path[0].p_depth = 0; 1837 1808 path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + ··· 1927 1896 * creating new leaf in the no-space case. 1928 1897 */ 1929 1898 int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, 1930 - struct ext4_ext_path *path, 1899 + struct ext4_ext_path **ppath, 1931 1900 struct ext4_extent *newext, int gb_flags) 1932 1901 { 1902 + struct ext4_ext_path *path = *ppath; 1933 1903 struct ext4_extent_header *eh; 1934 1904 struct ext4_extent *ex, *fex; 1935 1905 struct ext4_extent *nearex; /* nearest extent */ ··· 1939 1907 ext4_lblk_t next; 1940 1908 int mb_flags = 0, unwritten; 1941 1909 1910 + if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1911 + mb_flags |= EXT4_MB_DELALLOC_RESERVED; 1942 1912 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { 1943 1913 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); 1944 1914 return -EIO; ··· 1959 1925 /* 1960 1926 * Try to see whether we should rather test the extent on 1961 1927 * right from ex, or from the left of ex. This is because 1962 - * ext4_ext_find_extent() can return either extent on the 1928 + * ext4_find_extent() can return either extent on the 1963 1929 * left, or on the right from the searched position. This 1964 1930 * will make merging more effective. 1965 1931 */ ··· 2042 2008 if (next != EXT_MAX_BLOCKS) { 2043 2009 ext_debug("next leaf block - %u\n", next); 2044 2010 BUG_ON(npath != NULL); 2045 - npath = ext4_ext_find_extent(inode, next, NULL, 0); 2011 + npath = ext4_find_extent(inode, next, NULL, 0); 2046 2012 if (IS_ERR(npath)) 2047 2013 return PTR_ERR(npath); 2048 2014 BUG_ON(npath->p_depth != path->p_depth); ··· 2062 2028 * We're gonna add a new leaf in the tree. 2063 2029 */ 2064 2030 if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) 2065 - mb_flags = EXT4_MB_USE_RESERVED; 2031 + mb_flags |= EXT4_MB_USE_RESERVED; 2066 2032 err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, 2067 - path, newext); 2033 + ppath, newext); 2068 2034 if (err) 2069 2035 goto cleanup; 2070 2036 depth = ext_depth(inode); ··· 2142 2108 err = ext4_ext_dirty(handle, inode, path + path->p_depth); 2143 2109 2144 2110 cleanup: 2145 - if (npath) { 2146 - ext4_ext_drop_refs(npath); 2147 - kfree(npath); 2148 - } 2111 + ext4_ext_drop_refs(npath); 2112 + kfree(npath); 2149 2113 return err; 2150 2114 } 2151 2115 ··· 2165 2133 /* find extent for this block */ 2166 2134 down_read(&EXT4_I(inode)->i_data_sem); 2167 2135 2168 - if (path && ext_depth(inode) != depth) { 2169 - /* depth was changed. we have to realloc path */ 2170 - kfree(path); 2171 - path = NULL; 2172 - } 2173 - 2174 - path = ext4_ext_find_extent(inode, block, path, 0); 2136 + path = ext4_find_extent(inode, block, &path, 0); 2175 2137 if (IS_ERR(path)) { 2176 2138 up_read(&EXT4_I(inode)->i_data_sem); 2177 2139 err = PTR_ERR(path); ··· 2182 2156 } 2183 2157 ex = path[depth].p_ext; 2184 2158 next = ext4_ext_next_allocated_block(path); 2185 - ext4_ext_drop_refs(path); 2186 2159 2187 2160 flags = 0; 2188 2161 exists = 0; ··· 2291 2266 block = es.es_lblk + es.es_len; 2292 2267 } 2293 2268 2294 - if (path) { 2295 - ext4_ext_drop_refs(path); 2296 - kfree(path); 2297 - } 2298 - 2269 + ext4_ext_drop_refs(path); 2270 + kfree(path); 2299 2271 return err; 2300 2272 } 2301 2273 ··· 2848 2826 ext4_lblk_t ee_block; 2849 2827 2850 2828 /* find extent for this block */ 2851 - path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); 2829 + path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); 2852 2830 if (IS_ERR(path)) { 2853 2831 ext4_journal_stop(handle); 2854 2832 return PTR_ERR(path); ··· 2876 2854 */ 2877 2855 if (end >= ee_block && 2878 2856 end < ee_block + ext4_ext_get_actual_len(ex) - 1) { 2879 - int split_flag = 0; 2880 - 2881 - if (ext4_ext_is_unwritten(ex)) 2882 - split_flag = EXT4_EXT_MARK_UNWRIT1 | 2883 - EXT4_EXT_MARK_UNWRIT2; 2884 - 2885 2857 /* 2886 2858 * Split the extent in two so that 'end' is the last 2887 2859 * block in the first new extent. Also we should not 2888 2860 * fail removing space due to ENOSPC so try to use 2889 2861 * reserved block if that happens. 2890 2862 */ 2891 - err = ext4_split_extent_at(handle, inode, path, 2892 - end + 1, split_flag, 2893 - EXT4_EX_NOCACHE | 2894 - EXT4_GET_BLOCKS_PRE_IO | 2895 - EXT4_GET_BLOCKS_METADATA_NOFAIL); 2896 - 2863 + err = ext4_force_split_extent_at(handle, inode, &path, 2864 + end + 1, 1); 2897 2865 if (err < 0) 2898 2866 goto out; 2899 2867 } ··· 2905 2893 ext4_journal_stop(handle); 2906 2894 return -ENOMEM; 2907 2895 } 2908 - path[0].p_depth = depth; 2896 + path[0].p_maxdepth = path[0].p_depth = depth; 2909 2897 path[0].p_hdr = ext_inode_hdr(inode); 2910 2898 i = 0; 2911 2899 ··· 3025 3013 out: 3026 3014 ext4_ext_drop_refs(path); 3027 3015 kfree(path); 3028 - if (err == -EAGAIN) { 3029 - path = NULL; 3016 + path = NULL; 3017 + if (err == -EAGAIN) 3030 3018 goto again; 3031 - } 3032 3019 ext4_journal_stop(handle); 3033 3020 3034 3021 return err; ··· 3141 3130 */ 3142 3131 static int ext4_split_extent_at(handle_t *handle, 3143 3132 struct inode *inode, 3144 - struct ext4_ext_path *path, 3133 + struct ext4_ext_path **ppath, 3145 3134 ext4_lblk_t split, 3146 3135 int split_flag, 3147 3136 int flags) 3148 3137 { 3138 + struct ext4_ext_path *path = *ppath; 3149 3139 ext4_fsblk_t newblock; 3150 3140 ext4_lblk_t ee_block; 3151 3141 struct ext4_extent *ex, newex, orig_ex, zero_ex; ··· 3217 3205 if (split_flag & EXT4_EXT_MARK_UNWRIT2) 3218 3206 ext4_ext_mark_unwritten(ex2); 3219 3207 3220 - err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3208 + err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags); 3221 3209 if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { 3222 3210 if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { 3223 3211 if (split_flag & EXT4_EXT_DATA_VALID1) { ··· 3283 3271 */ 3284 3272 static int ext4_split_extent(handle_t *handle, 3285 3273 struct inode *inode, 3286 - struct ext4_ext_path *path, 3274 + struct ext4_ext_path **ppath, 3287 3275 struct ext4_map_blocks *map, 3288 3276 int split_flag, 3289 3277 int flags) 3290 3278 { 3279 + struct ext4_ext_path *path = *ppath; 3291 3280 ext4_lblk_t ee_block; 3292 3281 struct ext4_extent *ex; 3293 3282 unsigned int ee_len, depth; ··· 3311 3298 EXT4_EXT_MARK_UNWRIT2; 3312 3299 if (split_flag & EXT4_EXT_DATA_VALID2) 3313 3300 split_flag1 |= EXT4_EXT_DATA_VALID1; 3314 - err = ext4_split_extent_at(handle, inode, path, 3301 + err = ext4_split_extent_at(handle, inode, ppath, 3315 3302 map->m_lblk + map->m_len, split_flag1, flags1); 3316 3303 if (err) 3317 3304 goto out; ··· 3322 3309 * Update path is required because previous ext4_split_extent_at() may 3323 3310 * result in split of original leaf or extent zeroout. 3324 3311 */ 3325 - ext4_ext_drop_refs(path); 3326 - path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); 3312 + path = ext4_find_extent(inode, map->m_lblk, ppath, 0); 3327 3313 if (IS_ERR(path)) 3328 3314 return PTR_ERR(path); 3329 3315 depth = ext_depth(inode); ··· 3342 3330 split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | 3343 3331 EXT4_EXT_MARK_UNWRIT2); 3344 3332 } 3345 - err = ext4_split_extent_at(handle, inode, path, 3333 + err = ext4_split_extent_at(handle, inode, ppath, 3346 3334 map->m_lblk, split_flag1, flags); 3347 3335 if (err) 3348 3336 goto out; ··· 3376 3364 static int ext4_ext_convert_to_initialized(handle_t *handle, 3377 3365 struct inode *inode, 3378 3366 struct ext4_map_blocks *map, 3379 - struct ext4_ext_path *path, 3367 + struct ext4_ext_path **ppath, 3380 3368 int flags) 3381 3369 { 3370 + struct ext4_ext_path *path = *ppath; 3382 3371 struct ext4_sb_info *sbi; 3383 3372 struct ext4_extent_header *eh; 3384 3373 struct ext4_map_blocks split_map; ··· 3603 3590 } 3604 3591 } 3605 3592 3606 - allocated = ext4_split_extent(handle, inode, path, 3593 + allocated = ext4_split_extent(handle, inode, ppath, 3607 3594 &split_map, split_flag, flags); 3608 3595 if (allocated < 0) 3609 3596 err = allocated; ··· 3642 3629 static int ext4_split_convert_extents(handle_t *handle, 3643 3630 struct inode *inode, 3644 3631 struct ext4_map_blocks *map, 3645 - struct ext4_ext_path *path, 3632 + struct ext4_ext_path **ppath, 3646 3633 int flags) 3647 3634 { 3635 + struct ext4_ext_path *path = *ppath; 3648 3636 ext4_lblk_t eof_block; 3649 3637 ext4_lblk_t ee_block; 3650 3638 struct ext4_extent *ex; ··· 3679 3665 split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); 3680 3666 } 3681 3667 flags |= EXT4_GET_BLOCKS_PRE_IO; 3682 - return ext4_split_extent(handle, inode, path, map, split_flag, flags); 3668 + return ext4_split_extent(handle, inode, ppath, map, split_flag, flags); 3683 3669 } 3684 - 3685 - static int ext4_convert_initialized_extents(handle_t *handle, 3686 - struct inode *inode, 3687 - struct ext4_map_blocks *map, 3688 - struct ext4_ext_path *path) 3689 - { 3690 - struct ext4_extent *ex; 3691 - ext4_lblk_t ee_block; 3692 - unsigned int ee_len; 3693 - int depth; 3694 - int err = 0; 3695 - 3696 - depth = ext_depth(inode); 3697 - ex = path[depth].p_ext; 3698 - ee_block = le32_to_cpu(ex->ee_block); 3699 - ee_len = ext4_ext_get_actual_len(ex); 3700 - 3701 - ext_debug("%s: inode %lu, logical" 3702 - "block %llu, max_blocks %u\n", __func__, inode->i_ino, 3703 - (unsigned long long)ee_block, ee_len); 3704 - 3705 - if (ee_block != map->m_lblk || ee_len > map->m_len) { 3706 - err = ext4_split_convert_extents(handle, inode, map, path, 3707 - EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); 3708 - if (err < 0) 3709 - goto out; 3710 - ext4_ext_drop_refs(path); 3711 - path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); 3712 - if (IS_ERR(path)) { 3713 - err = PTR_ERR(path); 3714 - goto out; 3715 - } 3716 - depth = ext_depth(inode); 3717 - ex = path[depth].p_ext; 3718 - if (!ex) { 3719 - EXT4_ERROR_INODE(inode, "unexpected hole at %lu", 3720 - (unsigned long) map->m_lblk); 3721 - err = -EIO; 3722 - goto out; 3723 - } 3724 - } 3725 - 3726 - err = ext4_ext_get_access(handle, inode, path + depth); 3727 - if (err) 3728 - goto out; 3729 - /* first mark the extent as unwritten */ 3730 - ext4_ext_mark_unwritten(ex); 3731 - 3732 - /* note: ext4_ext_correct_indexes() isn't needed here because 3733 - * borders are not changed 3734 - */ 3735 - ext4_ext_try_to_merge(handle, inode, path, ex); 3736 - 3737 - /* Mark modified extent as dirty */ 3738 - err = ext4_ext_dirty(handle, inode, path + path->p_depth); 3739 - out: 3740 - ext4_ext_show_leaf(inode, path); 3741 - return err; 3742 - } 3743 - 3744 3670 3745 3671 static int ext4_convert_unwritten_extents_endio(handle_t *handle, 3746 3672 struct inode *inode, 3747 3673 struct ext4_map_blocks *map, 3748 - struct ext4_ext_path *path) 3674 + struct ext4_ext_path **ppath) 3749 3675 { 3676 + struct ext4_ext_path *path = *ppath; 3750 3677 struct ext4_extent *ex; 3751 3678 ext4_lblk_t ee_block; 3752 3679 unsigned int ee_len; ··· 3716 3761 inode->i_ino, (unsigned long long)ee_block, ee_len, 3717 3762 (unsigned long long)map->m_lblk, map->m_len); 3718 3763 #endif 3719 - err = ext4_split_convert_extents(handle, inode, map, path, 3764 + err = ext4_split_convert_extents(handle, inode, map, ppath, 3720 3765 EXT4_GET_BLOCKS_CONVERT); 3721 3766 if (err < 0) 3722 - goto out; 3723 - ext4_ext_drop_refs(path); 3724 - path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); 3725 - if (IS_ERR(path)) { 3726 - err = PTR_ERR(path); 3727 - goto out; 3728 - } 3767 + return err; 3768 + path = ext4_find_extent(inode, map->m_lblk, ppath, 0); 3769 + if (IS_ERR(path)) 3770 + return PTR_ERR(path); 3729 3771 depth = ext_depth(inode); 3730 3772 ex = path[depth].p_ext; 3731 3773 } ··· 3915 3963 } 3916 3964 3917 3965 static int 3918 - ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode, 3919 - struct ext4_map_blocks *map, 3920 - struct ext4_ext_path *path, int flags, 3921 - unsigned int allocated, ext4_fsblk_t newblock) 3966 + convert_initialized_extent(handle_t *handle, struct inode *inode, 3967 + struct ext4_map_blocks *map, 3968 + struct ext4_ext_path **ppath, int flags, 3969 + unsigned int allocated, ext4_fsblk_t newblock) 3922 3970 { 3923 - int ret = 0; 3971 + struct ext4_ext_path *path = *ppath; 3972 + struct ext4_extent *ex; 3973 + ext4_lblk_t ee_block; 3974 + unsigned int ee_len; 3975 + int depth; 3924 3976 int err = 0; 3925 3977 3926 3978 /* ··· 3934 3978 if (map->m_len > EXT_UNWRITTEN_MAX_LEN) 3935 3979 map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; 3936 3980 3937 - ret = ext4_convert_initialized_extents(handle, inode, map, 3938 - path); 3939 - if (ret >= 0) { 3940 - ext4_update_inode_fsync_trans(handle, inode, 1); 3941 - err = check_eofblocks_fl(handle, inode, map->m_lblk, 3942 - path, map->m_len); 3943 - } else 3944 - err = ret; 3981 + depth = ext_depth(inode); 3982 + ex = path[depth].p_ext; 3983 + ee_block = le32_to_cpu(ex->ee_block); 3984 + ee_len = ext4_ext_get_actual_len(ex); 3985 + 3986 + ext_debug("%s: inode %lu, logical" 3987 + "block %llu, max_blocks %u\n", __func__, inode->i_ino, 3988 + (unsigned long long)ee_block, ee_len); 3989 + 3990 + if (ee_block != map->m_lblk || ee_len > map->m_len) { 3991 + err = ext4_split_convert_extents(handle, inode, map, ppath, 3992 + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); 3993 + if (err < 0) 3994 + return err; 3995 + path = ext4_find_extent(inode, map->m_lblk, ppath, 0); 3996 + if (IS_ERR(path)) 3997 + return PTR_ERR(path); 3998 + depth = ext_depth(inode); 3999 + ex = path[depth].p_ext; 4000 + if (!ex) { 4001 + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", 4002 + (unsigned long) map->m_lblk); 4003 + return -EIO; 4004 + } 4005 + } 4006 + 4007 + err = ext4_ext_get_access(handle, inode, path + depth); 4008 + if (err) 4009 + return err; 4010 + /* first mark the extent as unwritten */ 4011 + ext4_ext_mark_unwritten(ex); 4012 + 4013 + /* note: ext4_ext_correct_indexes() isn't needed here because 4014 + * borders are not changed 4015 + */ 4016 + ext4_ext_try_to_merge(handle, inode, path, ex); 4017 + 4018 + /* Mark modified extent as dirty */ 4019 + err = ext4_ext_dirty(handle, inode, path + path->p_depth); 4020 + if (err) 4021 + return err; 4022 + ext4_ext_show_leaf(inode, path); 4023 + 4024 + ext4_update_inode_fsync_trans(handle, inode, 1); 4025 + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len); 4026 + if (err) 4027 + return err; 3945 4028 map->m_flags |= EXT4_MAP_UNWRITTEN; 3946 4029 if (allocated > map->m_len) 3947 4030 allocated = map->m_len; 3948 4031 map->m_len = allocated; 3949 - 3950 - return err ? err : allocated; 4032 + return allocated; 3951 4033 } 3952 4034 3953 4035 static int 3954 4036 ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, 3955 4037 struct ext4_map_blocks *map, 3956 - struct ext4_ext_path *path, int flags, 4038 + struct ext4_ext_path **ppath, int flags, 3957 4039 unsigned int allocated, ext4_fsblk_t newblock) 3958 4040 { 4041 + struct ext4_ext_path *path = *ppath; 3959 4042 int ret = 0; 3960 4043 int err = 0; 3961 4044 ext4_io_end_t *io = ext4_inode_aio(inode); ··· 4016 4021 4017 4022 /* get_block() before submit the IO, split the extent */ 4018 4023 if (flags & EXT4_GET_BLOCKS_PRE_IO) { 4019 - ret = ext4_split_convert_extents(handle, inode, map, 4020 - path, flags | EXT4_GET_BLOCKS_CONVERT); 4024 + ret = ext4_split_convert_extents(handle, inode, map, ppath, 4025 + flags | EXT4_GET_BLOCKS_CONVERT); 4021 4026 if (ret <= 0) 4022 4027 goto out; 4023 4028 /* ··· 4035 4040 /* IO end_io complete, convert the filled extent to written */ 4036 4041 if (flags & EXT4_GET_BLOCKS_CONVERT) { 4037 4042 ret = ext4_convert_unwritten_extents_endio(handle, inode, map, 4038 - path); 4043 + ppath); 4039 4044 if (ret >= 0) { 4040 4045 ext4_update_inode_fsync_trans(handle, inode, 1); 4041 4046 err = check_eofblocks_fl(handle, inode, map->m_lblk, ··· 4073 4078 } 4074 4079 4075 4080 /* buffered write, writepage time, convert*/ 4076 - ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags); 4081 + ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags); 4077 4082 if (ret >= 0) 4078 4083 ext4_update_inode_fsync_trans(handle, inode, 1); 4079 4084 out: ··· 4274 4279 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 4275 4280 4276 4281 /* find extent for this block */ 4277 - path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0); 4282 + path = ext4_find_extent(inode, map->m_lblk, NULL, 0); 4278 4283 if (IS_ERR(path)) { 4279 4284 err = PTR_ERR(path); 4280 4285 path = NULL; ··· 4286 4291 /* 4287 4292 * consistent leaf must not be empty; 4288 4293 * this situation is possible, though, _during_ tree modification; 4289 - * this is why assert can't be put in ext4_ext_find_extent() 4294 + * this is why assert can't be put in ext4_find_extent() 4290 4295 */ 4291 4296 if (unlikely(path[depth].p_ext == NULL && depth != 0)) { 4292 4297 EXT4_ERROR_INODE(inode, "bad extent address " ··· 4326 4331 */ 4327 4332 if ((!ext4_ext_is_unwritten(ex)) && 4328 4333 (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { 4329 - allocated = ext4_ext_convert_initialized_extent( 4330 - handle, inode, map, path, flags, 4331 - allocated, newblock); 4334 + allocated = convert_initialized_extent( 4335 + handle, inode, map, &path, 4336 + flags, allocated, newblock); 4332 4337 goto out2; 4333 4338 } else if (!ext4_ext_is_unwritten(ex)) 4334 4339 goto out; 4335 4340 4336 4341 ret = ext4_ext_handle_unwritten_extents( 4337 - handle, inode, map, path, flags, 4342 + handle, inode, map, &path, flags, 4338 4343 allocated, newblock); 4339 4344 if (ret < 0) 4340 4345 err = ret; ··· 4371 4376 4372 4377 /* 4373 4378 * If we are doing bigalloc, check to see if the extent returned 4374 - * by ext4_ext_find_extent() implies a cluster we can use. 4379 + * by ext4_find_extent() implies a cluster we can use. 4375 4380 */ 4376 4381 if (cluster_offset && ex && 4377 4382 get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { ··· 4446 4451 ar.flags = 0; 4447 4452 if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) 4448 4453 ar.flags |= EXT4_MB_HINT_NOPREALLOC; 4454 + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 4455 + ar.flags |= EXT4_MB_DELALLOC_RESERVED; 4449 4456 newblock = ext4_mb_new_blocks(handle, &ar, &err); 4450 4457 if (!newblock) 4451 4458 goto out2; ··· 4483 4486 err = check_eofblocks_fl(handle, inode, map->m_lblk, 4484 4487 path, ar.len); 4485 4488 if (!err) 4486 - err = ext4_ext_insert_extent(handle, inode, path, 4489 + err = ext4_ext_insert_extent(handle, inode, &path, 4487 4490 &newex, flags); 4488 4491 4489 4492 if (!err && set_unwritten) { ··· 4616 4619 map->m_pblk = newblock; 4617 4620 map->m_len = allocated; 4618 4621 out2: 4619 - if (path) { 4620 - ext4_ext_drop_refs(path); 4621 - kfree(path); 4622 - } 4622 + ext4_ext_drop_refs(path); 4623 + kfree(path); 4623 4624 4624 4625 trace_ext4_ext_map_blocks_exit(inode, flags, map, 4625 4626 err ? err : allocated); ··· 4794 4799 max_blocks -= lblk; 4795 4800 4796 4801 flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT | 4797 - EXT4_GET_BLOCKS_CONVERT_UNWRITTEN; 4802 + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | 4803 + EXT4_EX_NOCACHE; 4798 4804 if (mode & FALLOC_FL_KEEP_SIZE) 4799 4805 flags |= EXT4_GET_BLOCKS_KEEP_SIZE; 4800 4806 ··· 4833 4837 ext4_inode_block_unlocked_dio(inode); 4834 4838 inode_dio_wait(inode); 4835 4839 4836 - /* 4837 - * Remove entire range from the extent status tree. 4838 - */ 4839 - ret = ext4_es_remove_extent(inode, lblk, max_blocks); 4840 - if (ret) 4841 - goto out_dio; 4842 - 4843 4840 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, 4844 4841 flags, mode); 4842 + if (ret) 4843 + goto out_dio; 4844 + /* 4845 + * Remove entire range from the extent status tree. 4846 + * 4847 + * ext4_es_remove_extent(inode, lblk, max_blocks) is 4848 + * NOT sufficient. I'm not sure why this is the case, 4849 + * but let's be conservative and remove the extent 4850 + * status tree for the entire inode. There should be 4851 + * no outstanding delalloc extents thanks to the 4852 + * filemap_write_and_wait_range() call above. 4853 + */ 4854 + ret = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); 4845 4855 if (ret) 4846 4856 goto out_dio; 4847 4857 } ··· 5306 5304 struct ext4_ext_path *path; 5307 5305 int ret = 0, depth; 5308 5306 struct ext4_extent *extent; 5309 - ext4_lblk_t stop_block, current_block; 5307 + ext4_lblk_t stop_block; 5310 5308 ext4_lblk_t ex_start, ex_end; 5311 5309 5312 5310 /* Let path point to the last extent */ 5313 - path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); 5311 + path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); 5314 5312 if (IS_ERR(path)) 5315 5313 return PTR_ERR(path); 5316 5314 5317 5315 depth = path->p_depth; 5318 5316 extent = path[depth].p_ext; 5319 - if (!extent) { 5320 - ext4_ext_drop_refs(path); 5321 - kfree(path); 5322 - return ret; 5323 - } 5317 + if (!extent) 5318 + goto out; 5324 5319 5325 5320 stop_block = le32_to_cpu(extent->ee_block) + 5326 5321 ext4_ext_get_actual_len(extent); 5327 - ext4_ext_drop_refs(path); 5328 - kfree(path); 5329 5322 5330 5323 /* Nothing to shift, if hole is at the end of file */ 5331 5324 if (start >= stop_block) 5332 - return ret; 5325 + goto out; 5333 5326 5334 5327 /* 5335 5328 * Don't start shifting extents until we make sure the hole is big 5336 5329 * enough to accomodate the shift. 5337 5330 */ 5338 - path = ext4_ext_find_extent(inode, start - 1, NULL, 0); 5331 + path = ext4_find_extent(inode, start - 1, &path, 0); 5339 5332 if (IS_ERR(path)) 5340 5333 return PTR_ERR(path); 5341 5334 depth = path->p_depth; ··· 5343 5346 ex_start = 0; 5344 5347 ex_end = 0; 5345 5348 } 5346 - ext4_ext_drop_refs(path); 5347 - kfree(path); 5348 5349 5349 5350 if ((start == ex_start && shift > ex_start) || 5350 5351 (shift > start - ex_end)) ··· 5350 5355 5351 5356 /* Its safe to start updating extents */ 5352 5357 while (start < stop_block) { 5353 - path = ext4_ext_find_extent(inode, start, NULL, 0); 5358 + path = ext4_find_extent(inode, start, &path, 0); 5354 5359 if (IS_ERR(path)) 5355 5360 return PTR_ERR(path); 5356 5361 depth = path->p_depth; ··· 5360 5365 (unsigned long) start); 5361 5366 return -EIO; 5362 5367 } 5363 - 5364 - current_block = le32_to_cpu(extent->ee_block); 5365 - if (start > current_block) { 5368 + if (start > le32_to_cpu(extent->ee_block)) { 5366 5369 /* Hole, move to the next extent */ 5367 - ret = mext_next_extent(inode, path, &extent); 5368 - if (ret != 0) { 5369 - ext4_ext_drop_refs(path); 5370 - kfree(path); 5371 - if (ret == 1) 5372 - ret = 0; 5373 - break; 5370 + if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) { 5371 + path[depth].p_ext++; 5372 + } else { 5373 + start = ext4_ext_next_allocated_block(path); 5374 + continue; 5374 5375 } 5375 5376 } 5376 5377 ret = ext4_ext_shift_path_extents(path, shift, inode, 5377 5378 handle, &start); 5378 - ext4_ext_drop_refs(path); 5379 - kfree(path); 5380 5379 if (ret) 5381 5380 break; 5382 5381 } 5383 - 5382 + out: 5383 + ext4_ext_drop_refs(path); 5384 + kfree(path); 5384 5385 return ret; 5385 5386 } 5386 5387 ··· 5498 5507 out_mutex: 5499 5508 mutex_unlock(&inode->i_mutex); 5500 5509 return ret; 5510 + } 5511 + 5512 + /** 5513 + * ext4_swap_extents - Swap extents between two inodes 5514 + * 5515 + * @inode1: First inode 5516 + * @inode2: Second inode 5517 + * @lblk1: Start block for first inode 5518 + * @lblk2: Start block for second inode 5519 + * @count: Number of blocks to swap 5520 + * @mark_unwritten: Mark second inode's extents as unwritten after swap 5521 + * @erp: Pointer to save error value 5522 + * 5523 + * This helper routine does exactly what is promise "swap extents". All other 5524 + * stuff such as page-cache locking consistency, bh mapping consistency or 5525 + * extent's data copying must be performed by caller. 5526 + * Locking: 5527 + * i_mutex is held for both inodes 5528 + * i_data_sem is locked for write for both inodes 5529 + * Assumptions: 5530 + * All pages from requested range are locked for both inodes 5531 + */ 5532 + int 5533 + ext4_swap_extents(handle_t *handle, struct inode *inode1, 5534 + struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, 5535 + ext4_lblk_t count, int unwritten, int *erp) 5536 + { 5537 + struct ext4_ext_path *path1 = NULL; 5538 + struct ext4_ext_path *path2 = NULL; 5539 + int replaced_count = 0; 5540 + 5541 + BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); 5542 + BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); 5543 + BUG_ON(!mutex_is_locked(&inode1->i_mutex)); 5544 + BUG_ON(!mutex_is_locked(&inode1->i_mutex)); 5545 + 5546 + *erp = ext4_es_remove_extent(inode1, lblk1, count); 5547 + if (unlikely(*erp)) 5548 + return 0; 5549 + *erp = ext4_es_remove_extent(inode2, lblk2, count); 5550 + if (unlikely(*erp)) 5551 + return 0; 5552 + 5553 + while (count) { 5554 + struct ext4_extent *ex1, *ex2, tmp_ex; 5555 + ext4_lblk_t e1_blk, e2_blk; 5556 + int e1_len, e2_len, len; 5557 + int split = 0; 5558 + 5559 + path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE); 5560 + if (unlikely(IS_ERR(path1))) { 5561 + *erp = PTR_ERR(path1); 5562 + path1 = NULL; 5563 + finish: 5564 + count = 0; 5565 + goto repeat; 5566 + } 5567 + path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE); 5568 + if (unlikely(IS_ERR(path2))) { 5569 + *erp = PTR_ERR(path2); 5570 + path2 = NULL; 5571 + goto finish; 5572 + } 5573 + ex1 = path1[path1->p_depth].p_ext; 5574 + ex2 = path2[path2->p_depth].p_ext; 5575 + /* Do we have somthing to swap ? */ 5576 + if (unlikely(!ex2 || !ex1)) 5577 + goto finish; 5578 + 5579 + e1_blk = le32_to_cpu(ex1->ee_block); 5580 + e2_blk = le32_to_cpu(ex2->ee_block); 5581 + e1_len = ext4_ext_get_actual_len(ex1); 5582 + e2_len = ext4_ext_get_actual_len(ex2); 5583 + 5584 + /* Hole handling */ 5585 + if (!in_range(lblk1, e1_blk, e1_len) || 5586 + !in_range(lblk2, e2_blk, e2_len)) { 5587 + ext4_lblk_t next1, next2; 5588 + 5589 + /* if hole after extent, then go to next extent */ 5590 + next1 = ext4_ext_next_allocated_block(path1); 5591 + next2 = ext4_ext_next_allocated_block(path2); 5592 + /* If hole before extent, then shift to that extent */ 5593 + if (e1_blk > lblk1) 5594 + next1 = e1_blk; 5595 + if (e2_blk > lblk2) 5596 + next2 = e1_blk; 5597 + /* Do we have something to swap */ 5598 + if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS) 5599 + goto finish; 5600 + /* Move to the rightest boundary */ 5601 + len = next1 - lblk1; 5602 + if (len < next2 - lblk2) 5603 + len = next2 - lblk2; 5604 + if (len > count) 5605 + len = count; 5606 + lblk1 += len; 5607 + lblk2 += len; 5608 + count -= len; 5609 + goto repeat; 5610 + } 5611 + 5612 + /* Prepare left boundary */ 5613 + if (e1_blk < lblk1) { 5614 + split = 1; 5615 + *erp = ext4_force_split_extent_at(handle, inode1, 5616 + &path1, lblk1, 0); 5617 + if (unlikely(*erp)) 5618 + goto finish; 5619 + } 5620 + if (e2_blk < lblk2) { 5621 + split = 1; 5622 + *erp = ext4_force_split_extent_at(handle, inode2, 5623 + &path2, lblk2, 0); 5624 + if (unlikely(*erp)) 5625 + goto finish; 5626 + } 5627 + /* ext4_split_extent_at() may result in leaf extent split, 5628 + * path must to be revalidated. */ 5629 + if (split) 5630 + goto repeat; 5631 + 5632 + /* Prepare right boundary */ 5633 + len = count; 5634 + if (len > e1_blk + e1_len - lblk1) 5635 + len = e1_blk + e1_len - lblk1; 5636 + if (len > e2_blk + e2_len - lblk2) 5637 + len = e2_blk + e2_len - lblk2; 5638 + 5639 + if (len != e1_len) { 5640 + split = 1; 5641 + *erp = ext4_force_split_extent_at(handle, inode1, 5642 + &path1, lblk1 + len, 0); 5643 + if (unlikely(*erp)) 5644 + goto finish; 5645 + } 5646 + if (len != e2_len) { 5647 + split = 1; 5648 + *erp = ext4_force_split_extent_at(handle, inode2, 5649 + &path2, lblk2 + len, 0); 5650 + if (*erp) 5651 + goto finish; 5652 + } 5653 + /* ext4_split_extent_at() may result in leaf extent split, 5654 + * path must to be revalidated. */ 5655 + if (split) 5656 + goto repeat; 5657 + 5658 + BUG_ON(e2_len != e1_len); 5659 + *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth); 5660 + if (unlikely(*erp)) 5661 + goto finish; 5662 + *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth); 5663 + if (unlikely(*erp)) 5664 + goto finish; 5665 + 5666 + /* Both extents are fully inside boundaries. Swap it now */ 5667 + tmp_ex = *ex1; 5668 + ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2)); 5669 + ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex)); 5670 + ex1->ee_len = cpu_to_le16(e2_len); 5671 + ex2->ee_len = cpu_to_le16(e1_len); 5672 + if (unwritten) 5673 + ext4_ext_mark_unwritten(ex2); 5674 + if (ext4_ext_is_unwritten(&tmp_ex)) 5675 + ext4_ext_mark_unwritten(ex1); 5676 + 5677 + ext4_ext_try_to_merge(handle, inode2, path2, ex2); 5678 + ext4_ext_try_to_merge(handle, inode1, path1, ex1); 5679 + *erp = ext4_ext_dirty(handle, inode2, path2 + 5680 + path2->p_depth); 5681 + if (unlikely(*erp)) 5682 + goto finish; 5683 + *erp = ext4_ext_dirty(handle, inode1, path1 + 5684 + path1->p_depth); 5685 + /* 5686 + * Looks scarry ah..? second inode already points to new blocks, 5687 + * and it was successfully dirtied. But luckily error may happen 5688 + * only due to journal error, so full transaction will be 5689 + * aborted anyway. 5690 + */ 5691 + if (unlikely(*erp)) 5692 + goto finish; 5693 + lblk1 += len; 5694 + lblk2 += len; 5695 + replaced_count += len; 5696 + count -= len; 5697 + 5698 + repeat: 5699 + ext4_ext_drop_refs(path1); 5700 + kfree(path1); 5701 + ext4_ext_drop_refs(path2); 5702 + kfree(path2); 5703 + path1 = path2 = NULL; 5704 + } 5705 + return replaced_count; 5501 5706 }

+182 -18

fs/ext4/extents_status.c

··· 11 11 */ 12 12 #include <linux/rbtree.h> 13 13 #include <linux/list_sort.h> 14 + #include <linux/proc_fs.h> 15 + #include <linux/seq_file.h> 14 16 #include "ext4.h" 15 17 #include "extents_status.h" 16 18 ··· 315 313 */ 316 314 if (!ext4_es_is_delayed(es)) { 317 315 EXT4_I(inode)->i_es_lru_nr++; 318 - percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); 316 + percpu_counter_inc(&EXT4_SB(inode->i_sb)-> 317 + s_es_stats.es_stats_lru_cnt); 319 318 } 319 + 320 + EXT4_I(inode)->i_es_all_nr++; 321 + percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); 320 322 321 323 return es; 322 324 } 323 325 324 326 static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) 325 327 { 328 + EXT4_I(inode)->i_es_all_nr--; 329 + percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); 330 + 326 331 /* Decrease the lru counter when this es is not delayed */ 327 332 if (!ext4_es_is_delayed(es)) { 328 333 BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); 329 334 EXT4_I(inode)->i_es_lru_nr--; 330 - percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); 335 + percpu_counter_dec(&EXT4_SB(inode->i_sb)-> 336 + s_es_stats.es_stats_lru_cnt); 331 337 } 332 338 333 339 kmem_cache_free(ext4_es_cachep, es); ··· 436 426 unsigned short ee_len; 437 427 int depth, ee_status, es_status; 438 428 439 - path = ext4_ext_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE); 429 + path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE); 440 430 if (IS_ERR(path)) 441 431 return; 442 432 ··· 509 499 } 510 500 } 511 501 out: 512 - if (path) { 513 - ext4_ext_drop_refs(path); 514 - kfree(path); 515 - } 502 + ext4_ext_drop_refs(path); 503 + kfree(path); 516 504 } 517 505 518 506 static void ext4_es_insert_extent_ind_check(struct inode *inode, ··· 739 731 struct extent_status *es) 740 732 { 741 733 struct ext4_es_tree *tree; 734 + struct ext4_es_stats *stats; 742 735 struct extent_status *es1 = NULL; 743 736 struct rb_node *node; 744 737 int found = 0; ··· 776 767 } 777 768 778 769 out: 770 + stats = &EXT4_SB(inode->i_sb)->s_es_stats; 779 771 if (found) { 780 772 BUG_ON(!es1); 781 773 es->es_lblk = es1->es_lblk; 782 774 es->es_len = es1->es_len; 783 775 es->es_pblk = es1->es_pblk; 776 + stats->es_stats_cache_hits++; 777 + } else { 778 + stats->es_stats_cache_misses++; 784 779 } 785 780 786 781 read_unlock(&EXT4_I(inode)->i_es_lock); ··· 946 933 struct ext4_inode_info *locked_ei) 947 934 { 948 935 struct ext4_inode_info *ei; 936 + struct ext4_es_stats *es_stats; 949 937 struct list_head *cur, *tmp; 950 938 LIST_HEAD(skipped); 939 + ktime_t start_time; 940 + u64 scan_time; 951 941 int nr_shrunk = 0; 952 942 int retried = 0, skip_precached = 1, nr_skipped = 0; 953 943 944 + es_stats = &sbi->s_es_stats; 945 + start_time = ktime_get(); 954 946 spin_lock(&sbi->s_es_lru_lock); 955 947 956 948 retry: ··· 966 948 * If we have already reclaimed all extents from extent 967 949 * status tree, just stop the loop immediately. 968 950 */ 969 - if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0) 951 + if (percpu_counter_read_positive( 952 + &es_stats->es_stats_lru_cnt) == 0) 970 953 break; 971 954 972 955 ei = list_entry(cur, struct ext4_inode_info, i_es_lru); ··· 977 958 * time. Normally we try hard to avoid shrinking 978 959 * precached inodes, but we will as a last resort. 979 960 */ 980 - if ((sbi->s_es_last_sorted < ei->i_touch_when) || 961 + if ((es_stats->es_stats_last_sorted < ei->i_touch_when) || 981 962 (skip_precached && ext4_test_inode_state(&ei->vfs_inode, 982 963 EXT4_STATE_EXT_PRECACHED))) { 983 964 nr_skipped++; ··· 1011 992 if ((nr_shrunk == 0) && nr_skipped && !retried) { 1012 993 retried++; 1013 994 list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); 1014 - sbi->s_es_last_sorted = jiffies; 995 + es_stats->es_stats_last_sorted = jiffies; 1015 996 ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, 1016 997 i_es_lru); 1017 998 /* ··· 1029 1010 if (locked_ei && nr_shrunk == 0) 1030 1011 nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); 1031 1012 1013 + scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); 1014 + if (likely(es_stats->es_stats_scan_time)) 1015 + es_stats->es_stats_scan_time = (scan_time + 1016 + es_stats->es_stats_scan_time*3) / 4; 1017 + else 1018 + es_stats->es_stats_scan_time = scan_time; 1019 + if (scan_time > es_stats->es_stats_max_scan_time) 1020 + es_stats->es_stats_max_scan_time = scan_time; 1021 + if (likely(es_stats->es_stats_shrunk)) 1022 + es_stats->es_stats_shrunk = (nr_shrunk + 1023 + es_stats->es_stats_shrunk*3) / 4; 1024 + else 1025 + es_stats->es_stats_shrunk = nr_shrunk; 1026 + 1027 + trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached, 1028 + nr_skipped, retried); 1032 1029 return nr_shrunk; 1033 1030 } 1034 1031 ··· 1055 1020 struct ext4_sb_info *sbi; 1056 1021 1057 1022 sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); 1058 - nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); 1059 - trace_ext4_es_shrink_enter(sbi->s_sb, sc->nr_to_scan, nr); 1023 + nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); 1024 + trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); 1060 1025 return nr; 1061 1026 } 1062 1027 ··· 1068 1033 int nr_to_scan = sc->nr_to_scan; 1069 1034 int ret, nr_shrunk; 1070 1035 1071 - ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); 1072 - trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret); 1036 + ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); 1037 + trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); 1073 1038 1074 1039 if (!nr_to_scan) 1075 1040 return ret; 1076 1041 1077 1042 nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); 1078 1043 1079 - trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret); 1044 + trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret); 1080 1045 return nr_shrunk; 1081 1046 } 1082 1047 1083 - void ext4_es_register_shrinker(struct ext4_sb_info *sbi) 1048 + static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos) 1084 1049 { 1050 + return *pos ? NULL : SEQ_START_TOKEN; 1051 + } 1052 + 1053 + static void * 1054 + ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos) 1055 + { 1056 + return NULL; 1057 + } 1058 + 1059 + static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) 1060 + { 1061 + struct ext4_sb_info *sbi = seq->private; 1062 + struct ext4_es_stats *es_stats = &sbi->s_es_stats; 1063 + struct ext4_inode_info *ei, *max = NULL; 1064 + unsigned int inode_cnt = 0; 1065 + 1066 + if (v != SEQ_START_TOKEN) 1067 + return 0; 1068 + 1069 + /* here we just find an inode that has the max nr. of objects */ 1070 + spin_lock(&sbi->s_es_lru_lock); 1071 + list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) { 1072 + inode_cnt++; 1073 + if (max && max->i_es_all_nr < ei->i_es_all_nr) 1074 + max = ei; 1075 + else if (!max) 1076 + max = ei; 1077 + } 1078 + spin_unlock(&sbi->s_es_lru_lock); 1079 + 1080 + seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", 1081 + percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), 1082 + percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt)); 1083 + seq_printf(seq, " %lu/%lu cache hits/misses\n", 1084 + es_stats->es_stats_cache_hits, 1085 + es_stats->es_stats_cache_misses); 1086 + if (es_stats->es_stats_last_sorted != 0) 1087 + seq_printf(seq, " %u ms last sorted interval\n", 1088 + jiffies_to_msecs(jiffies - 1089 + es_stats->es_stats_last_sorted)); 1090 + if (inode_cnt) 1091 + seq_printf(seq, " %d inodes on lru list\n", inode_cnt); 1092 + 1093 + seq_printf(seq, "average:\n %llu us scan time\n", 1094 + div_u64(es_stats->es_stats_scan_time, 1000)); 1095 + seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk); 1096 + if (inode_cnt) 1097 + seq_printf(seq, 1098 + "maximum:\n %lu inode (%u objects, %u reclaimable)\n" 1099 + " %llu us max scan time\n", 1100 + max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr, 1101 + div_u64(es_stats->es_stats_max_scan_time, 1000)); 1102 + 1103 + return 0; 1104 + } 1105 + 1106 + static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v) 1107 + { 1108 + } 1109 + 1110 + static const struct seq_operations ext4_es_seq_shrinker_info_ops = { 1111 + .start = ext4_es_seq_shrinker_info_start, 1112 + .next = ext4_es_seq_shrinker_info_next, 1113 + .stop = ext4_es_seq_shrinker_info_stop, 1114 + .show = ext4_es_seq_shrinker_info_show, 1115 + }; 1116 + 1117 + static int 1118 + ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file) 1119 + { 1120 + int ret; 1121 + 1122 + ret = seq_open(file, &ext4_es_seq_shrinker_info_ops); 1123 + if (!ret) { 1124 + struct seq_file *m = file->private_data; 1125 + m->private = PDE_DATA(inode); 1126 + } 1127 + 1128 + return ret; 1129 + } 1130 + 1131 + static int 1132 + ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file) 1133 + { 1134 + return seq_release(inode, file); 1135 + } 1136 + 1137 + static const struct file_operations ext4_es_seq_shrinker_info_fops = { 1138 + .owner = THIS_MODULE, 1139 + .open = ext4_es_seq_shrinker_info_open, 1140 + .read = seq_read, 1141 + .llseek = seq_lseek, 1142 + .release = ext4_es_seq_shrinker_info_release, 1143 + }; 1144 + 1145 + int ext4_es_register_shrinker(struct ext4_sb_info *sbi) 1146 + { 1147 + int err; 1148 + 1085 1149 INIT_LIST_HEAD(&sbi->s_es_lru); 1086 1150 spin_lock_init(&sbi->s_es_lru_lock); 1087 - sbi->s_es_last_sorted = 0; 1151 + sbi->s_es_stats.es_stats_last_sorted = 0; 1152 + sbi->s_es_stats.es_stats_shrunk = 0; 1153 + sbi->s_es_stats.es_stats_cache_hits = 0; 1154 + sbi->s_es_stats.es_stats_cache_misses = 0; 1155 + sbi->s_es_stats.es_stats_scan_time = 0; 1156 + sbi->s_es_stats.es_stats_max_scan_time = 0; 1157 + err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL); 1158 + if (err) 1159 + return err; 1160 + err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0, GFP_KERNEL); 1161 + if (err) 1162 + goto err1; 1163 + 1088 1164 sbi->s_es_shrinker.scan_objects = ext4_es_scan; 1089 1165 sbi->s_es_shrinker.count_objects = ext4_es_count; 1090 1166 sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; 1091 - register_shrinker(&sbi->s_es_shrinker); 1167 + err = register_shrinker(&sbi->s_es_shrinker); 1168 + if (err) 1169 + goto err2; 1170 + 1171 + if (sbi->s_proc) 1172 + proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc, 1173 + &ext4_es_seq_shrinker_info_fops, sbi); 1174 + 1175 + return 0; 1176 + 1177 + err2: 1178 + percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); 1179 + err1: 1180 + percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); 1181 + return err; 1092 1182 } 1093 1183 1094 1184 void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) 1095 1185 { 1186 + if (sbi->s_proc) 1187 + remove_proc_entry("es_shrinker_info", sbi->s_proc); 1188 + percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); 1189 + percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); 1096 1190 unregister_shrinker(&sbi->s_es_shrinker); 1097 1191 } 1098 1192

+12 -1

fs/ext4/extents_status.h

··· 64 64 struct extent_status *cache_es; /* recently accessed extent */ 65 65 }; 66 66 67 + struct ext4_es_stats { 68 + unsigned long es_stats_last_sorted; 69 + unsigned long es_stats_shrunk; 70 + unsigned long es_stats_cache_hits; 71 + unsigned long es_stats_cache_misses; 72 + u64 es_stats_scan_time; 73 + u64 es_stats_max_scan_time; 74 + struct percpu_counter es_stats_all_cnt; 75 + struct percpu_counter es_stats_lru_cnt; 76 + }; 77 + 67 78 extern int __init ext4_init_es(void); 68 79 extern void ext4_exit_es(void); 69 80 extern void ext4_es_init_tree(struct ext4_es_tree *tree); ··· 149 138 (pb & ~ES_MASK)); 150 139 } 151 140 152 - extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); 141 + extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); 153 142 extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); 154 143 extern void ext4_es_lru_add(struct inode *inode); 155 144 extern void ext4_es_lru_del(struct inode *inode);

+1 -2

fs/ext4/ialloc.c

··· 1011 1011 spin_unlock(&sbi->s_next_gen_lock); 1012 1012 1013 1013 /* Precompute checksum seed for inode metadata */ 1014 - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 1015 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { 1014 + if (ext4_has_metadata_csum(sb)) { 1016 1015 __u32 csum; 1017 1016 __le32 inum = cpu_to_le32(inode->i_ino); 1018 1017 __le32 gen = cpu_to_le32(inode->i_generation);

+42 -44

fs/ext4/indirect.c

··· 318 318 * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain 319 319 * as described above and return 0. 320 320 */ 321 - static int ext4_alloc_branch(handle_t *handle, struct inode *inode, 322 - ext4_lblk_t iblock, int indirect_blks, 323 - int *blks, ext4_fsblk_t goal, 324 - ext4_lblk_t *offsets, Indirect *branch) 321 + static int ext4_alloc_branch(handle_t *handle, 322 + struct ext4_allocation_request *ar, 323 + int indirect_blks, ext4_lblk_t *offsets, 324 + Indirect *branch) 325 325 { 326 - struct ext4_allocation_request ar; 327 326 struct buffer_head * bh; 328 327 ext4_fsblk_t b, new_blocks[4]; 329 328 __le32 *p; 330 329 int i, j, err, len = 1; 331 330 332 - /* 333 - * Set up for the direct block allocation 334 - */ 335 - memset(&ar, 0, sizeof(ar)); 336 - ar.inode = inode; 337 - ar.len = *blks; 338 - ar.logical = iblock; 339 - if (S_ISREG(inode->i_mode)) 340 - ar.flags = EXT4_MB_HINT_DATA; 341 - 342 331 for (i = 0; i <= indirect_blks; i++) { 343 332 if (i == indirect_blks) { 344 - ar.goal = goal; 345 - new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err); 333 + new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err); 346 334 } else 347 - goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode, 348 - goal, 0, NULL, &err); 335 + ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle, 336 + ar->inode, ar->goal, 337 + ar->flags & EXT4_MB_DELALLOC_RESERVED, 338 + NULL, &err); 349 339 if (err) { 350 340 i--; 351 341 goto failed; ··· 344 354 if (i == 0) 345 355 continue; 346 356 347 - bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]); 357 + bh = branch[i].bh = sb_getblk(ar->inode->i_sb, new_blocks[i-1]); 348 358 if (unlikely(!bh)) { 349 359 err = -ENOMEM; 350 360 goto failed; ··· 362 372 b = new_blocks[i]; 363 373 364 374 if (i == indirect_blks) 365 - len = ar.len; 375 + len = ar->len; 366 376 for (j = 0; j < len; j++) 367 377 *p++ = cpu_to_le32(b++); 368 378 ··· 371 381 unlock_buffer(bh); 372 382 373 383 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 374 - err = ext4_handle_dirty_metadata(handle, inode, bh); 384 + err = ext4_handle_dirty_metadata(handle, ar->inode, bh); 375 385 if (err) 376 386 goto failed; 377 387 } 378 - *blks = ar.len; 379 388 return 0; 380 389 failed: 381 390 for (; i >= 0; i--) { ··· 385 396 * existing before ext4_alloc_branch() was called. 386 397 */ 387 398 if (i > 0 && i != indirect_blks && branch[i].bh) 388 - ext4_forget(handle, 1, inode, branch[i].bh, 399 + ext4_forget(handle, 1, ar->inode, branch[i].bh, 389 400 branch[i].bh->b_blocknr); 390 - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 391 - (i == indirect_blks) ? ar.len : 1, 0); 401 + ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i], 402 + (i == indirect_blks) ? ar->len : 1, 0); 392 403 } 393 404 return err; 394 405 } ··· 408 419 * inode (->i_blocks, etc.). In case of success we end up with the full 409 420 * chain to new block and return 0. 410 421 */ 411 - static int ext4_splice_branch(handle_t *handle, struct inode *inode, 412 - ext4_lblk_t block, Indirect *where, int num, 413 - int blks) 422 + static int ext4_splice_branch(handle_t *handle, 423 + struct ext4_allocation_request *ar, 424 + Indirect *where, int num) 414 425 { 415 426 int i; 416 427 int err = 0; ··· 435 446 * Update the host buffer_head or inode to point to more just allocated 436 447 * direct blocks blocks 437 448 */ 438 - if (num == 0 && blks > 1) { 449 + if (num == 0 && ar->len > 1) { 439 450 current_block = le32_to_cpu(where->key) + 1; 440 - for (i = 1; i < blks; i++) 451 + for (i = 1; i < ar->len; i++) 441 452 *(where->p + i) = cpu_to_le32(current_block++); 442 453 } 443 454 ··· 454 465 */ 455 466 jbd_debug(5, "splicing indirect only\n"); 456 467 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); 457 - err = ext4_handle_dirty_metadata(handle, inode, where->bh); 468 + err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh); 458 469 if (err) 459 470 goto err_out; 460 471 } else { 461 472 /* 462 473 * OK, we spliced it into the inode itself on a direct block. 463 474 */ 464 - ext4_mark_inode_dirty(handle, inode); 475 + ext4_mark_inode_dirty(handle, ar->inode); 465 476 jbd_debug(5, "splicing direct\n"); 466 477 } 467 478 return err; ··· 473 484 * need to revoke the block, which is why we don't 474 485 * need to set EXT4_FREE_BLOCKS_METADATA. 475 486 */ 476 - ext4_free_blocks(handle, inode, where[i].bh, 0, 1, 487 + ext4_free_blocks(handle, ar->inode, where[i].bh, 0, 1, 477 488 EXT4_FREE_BLOCKS_FORGET); 478 489 } 479 - ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), 480 - blks, 0); 490 + ext4_free_blocks(handle, ar->inode, NULL, le32_to_cpu(where[num].key), 491 + ar->len, 0); 481 492 482 493 return err; 483 494 } ··· 514 525 struct ext4_map_blocks *map, 515 526 int flags) 516 527 { 528 + struct ext4_allocation_request ar; 517 529 int err = -EIO; 518 530 ext4_lblk_t offsets[4]; 519 531 Indirect chain[4]; 520 532 Indirect *partial; 521 - ext4_fsblk_t goal; 522 533 int indirect_blks; 523 534 int blocks_to_boundary = 0; 524 535 int depth; ··· 568 579 return -ENOSPC; 569 580 } 570 581 571 - goal = ext4_find_goal(inode, map->m_lblk, partial); 582 + /* Set up for the direct block allocation */ 583 + memset(&ar, 0, sizeof(ar)); 584 + ar.inode = inode; 585 + ar.logical = map->m_lblk; 586 + if (S_ISREG(inode->i_mode)) 587 + ar.flags = EXT4_MB_HINT_DATA; 588 + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 589 + ar.flags |= EXT4_MB_DELALLOC_RESERVED; 590 + 591 + ar.goal = ext4_find_goal(inode, map->m_lblk, partial); 572 592 573 593 /* the number of blocks need to allocate for [d,t]indirect blocks */ 574 594 indirect_blks = (chain + depth) - partial - 1; ··· 586 588 * Next look up the indirect map to count the totoal number of 587 589 * direct blocks to allocate for this branch. 588 590 */ 589 - count = ext4_blks_to_allocate(partial, indirect_blks, 590 - map->m_len, blocks_to_boundary); 591 + ar.len = ext4_blks_to_allocate(partial, indirect_blks, 592 + map->m_len, blocks_to_boundary); 593 + 591 594 /* 592 595 * Block out ext4_truncate while we alter the tree 593 596 */ 594 - err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, 595 - &count, goal, 597 + err = ext4_alloc_branch(handle, &ar, indirect_blks, 596 598 offsets + (partial - chain), partial); 597 599 598 600 /* ··· 603 605 * may need to return -EAGAIN upwards in the worst case. --sct 604 606 */ 605 607 if (!err) 606 - err = ext4_splice_branch(handle, inode, map->m_lblk, 607 - partial, indirect_blks, count); 608 + err = ext4_splice_branch(handle, &ar, partial, indirect_blks); 608 609 if (err) 609 610 goto cleanup; 610 611 611 612 map->m_flags |= EXT4_MAP_NEW; 612 613 613 614 ext4_update_inode_fsync_trans(handle, inode, 1); 615 + count = ar.len; 614 616 got_it: 615 617 map->m_flags |= EXT4_MAP_MAPPED; 616 618 map->m_pblk = le32_to_cpu(chain[depth-1].key);

+4 -3

fs/ext4/inline.c

··· 594 594 if (ret) { 595 595 unlock_page(page); 596 596 page_cache_release(page); 597 + page = NULL; 597 598 ext4_orphan_add(handle, inode); 598 599 up_write(&EXT4_I(inode)->xattr_sem); 599 600 sem_held = 0; ··· 614 613 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 615 614 goto retry; 616 615 617 - block_commit_write(page, from, to); 616 + if (page) 617 + block_commit_write(page, from, to); 618 618 out: 619 619 if (page) { 620 620 unlock_page(page); ··· 1128 1126 memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, 1129 1127 inline_size - EXT4_INLINE_DOTDOT_SIZE); 1130 1128 1131 - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 1132 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 1129 + if (ext4_has_metadata_csum(inode->i_sb)) 1133 1130 csum_size = sizeof(struct ext4_dir_entry_tail); 1134 1131 1135 1132 inode->i_size = inode->i_sb->s_blocksize;

+67 -66

fs/ext4/inode.c

··· 83 83 84 84 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 85 85 cpu_to_le32(EXT4_OS_LINUX) || 86 - !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 87 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 86 + !ext4_has_metadata_csum(inode->i_sb)) 88 87 return 1; 89 88 90 89 provided = le16_to_cpu(raw->i_checksum_lo); ··· 104 105 105 106 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 106 107 cpu_to_le32(EXT4_OS_LINUX) || 107 - !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 108 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 108 + !ext4_has_metadata_csum(inode->i_sb)) 109 109 return; 110 110 111 111 csum = ext4_inode_csum(inode, raw, ei); ··· 222 224 goto no_delete; 223 225 } 224 226 225 - if (!is_bad_inode(inode)) 226 - dquot_initialize(inode); 227 + if (is_bad_inode(inode)) 228 + goto no_delete; 229 + dquot_initialize(inode); 227 230 228 231 if (ext4_should_order_data(inode)) 229 232 ext4_begin_ordered_truncate(inode, 0); 230 233 truncate_inode_pages_final(&inode->i_data); 231 234 232 235 WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); 233 - if (is_bad_inode(inode)) 234 - goto no_delete; 235 236 236 237 /* 237 238 * Protect us against freezing - iput() caller didn't have to have any ··· 587 590 /* 588 591 * New blocks allocate and/or writing to unwritten extent 589 592 * will possibly result in updating i_data, so we take 590 - * the write lock of i_data_sem, and call get_blocks() 593 + * the write lock of i_data_sem, and call get_block() 591 594 * with create == 1 flag. 592 595 */ 593 596 down_write(&EXT4_I(inode)->i_data_sem); 594 597 595 - /* 596 - * if the caller is from delayed allocation writeout path 597 - * we have already reserved fs blocks for allocation 598 - * let the underlying get_block() function know to 599 - * avoid double accounting 600 - */ 601 - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 602 - ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); 603 598 /* 604 599 * We need to check for EXT4 here because migrate 605 600 * could have changed the inode type in between ··· 620 631 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) 621 632 ext4_da_update_reserve_space(inode, retval, 1); 622 633 } 623 - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 624 - ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); 625 634 626 635 if (retval > 0) { 627 636 unsigned int status; ··· 721 734 * `handle' can be NULL if create is zero 722 735 */ 723 736 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 724 - ext4_lblk_t block, int create, int *errp) 737 + ext4_lblk_t block, int create) 725 738 { 726 739 struct ext4_map_blocks map; 727 740 struct buffer_head *bh; 728 - int fatal = 0, err; 741 + int err; 729 742 730 743 J_ASSERT(handle != NULL || create == 0); 731 744 ··· 734 747 err = ext4_map_blocks(handle, inode, &map, 735 748 create ? EXT4_GET_BLOCKS_CREATE : 0); 736 749 737 - /* ensure we send some value back into *errp */ 738 - *errp = 0; 739 - 740 - if (create && err == 0) 741 - err = -ENOSPC; /* should never happen */ 750 + if (err == 0) 751 + return create ? ERR_PTR(-ENOSPC) : NULL; 742 752 if (err < 0) 743 - *errp = err; 744 - if (err <= 0) 745 - return NULL; 753 + return ERR_PTR(err); 746 754 747 755 bh = sb_getblk(inode->i_sb, map.m_pblk); 748 - if (unlikely(!bh)) { 749 - *errp = -ENOMEM; 750 - return NULL; 751 - } 756 + if (unlikely(!bh)) 757 + return ERR_PTR(-ENOMEM); 752 758 if (map.m_flags & EXT4_MAP_NEW) { 753 759 J_ASSERT(create != 0); 754 760 J_ASSERT(handle != NULL); ··· 755 775 */ 756 776 lock_buffer(bh); 757 777 BUFFER_TRACE(bh, "call get_create_access"); 758 - fatal = ext4_journal_get_create_access(handle, bh); 759 - if (!fatal && !buffer_uptodate(bh)) { 778 + err = ext4_journal_get_create_access(handle, bh); 779 + if (unlikely(err)) { 780 + unlock_buffer(bh); 781 + goto errout; 782 + } 783 + if (!buffer_uptodate(bh)) { 760 784 memset(bh->b_data, 0, inode->i_sb->s_blocksize); 761 785 set_buffer_uptodate(bh); 762 786 } 763 787 unlock_buffer(bh); 764 788 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 765 789 err = ext4_handle_dirty_metadata(handle, inode, bh); 766 - if (!fatal) 767 - fatal = err; 768 - } else { 790 + if (unlikely(err)) 791 + goto errout; 792 + } else 769 793 BUFFER_TRACE(bh, "not a new buffer"); 770 - } 771 - if (fatal) { 772 - *errp = fatal; 773 - brelse(bh); 774 - bh = NULL; 775 - } 776 794 return bh; 795 + errout: 796 + brelse(bh); 797 + return ERR_PTR(err); 777 798 } 778 799 779 800 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 780 - ext4_lblk_t block, int create, int *err) 801 + ext4_lblk_t block, int create) 781 802 { 782 803 struct buffer_head *bh; 783 804 784 - bh = ext4_getblk(handle, inode, block, create, err); 785 - if (!bh) 805 + bh = ext4_getblk(handle, inode, block, create); 806 + if (IS_ERR(bh)) 786 807 return bh; 787 - if (buffer_uptodate(bh)) 808 + if (!bh || buffer_uptodate(bh)) 788 809 return bh; 789 810 ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); 790 811 wait_on_buffer(bh); 791 812 if (buffer_uptodate(bh)) 792 813 return bh; 793 814 put_bh(bh); 794 - *err = -EIO; 795 - return NULL; 815 + return ERR_PTR(-EIO); 796 816 } 797 817 798 818 int ext4_walk_page_buffers(handle_t *handle, ··· 1516 1536 } 1517 1537 1518 1538 /* 1519 - * This is a special get_blocks_t callback which is used by 1539 + * This is a special get_block_t callback which is used by 1520 1540 * ext4_da_write_begin(). It will either return mapped block or 1521 1541 * reserve space for a single block. 1522 1542 * ··· 1991 2011 * in data loss. So use reserved blocks to allocate metadata if 1992 2012 * possible. 1993 2013 * 1994 - * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks 1995 - * in question are delalloc blocks. This affects functions in many 1996 - * different parts of the allocation call path. This flag exists 1997 - * primarily because we don't want to change *many* call functions, so 1998 - * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag 1999 - * once the inode's allocation semaphore is taken. 2014 + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if 2015 + * the blocks in question are delalloc blocks. This indicates 2016 + * that the blocks and quotas has already been checked when 2017 + * the data was copied into the page cache. 2000 2018 */ 2001 2019 get_blocks_flags = EXT4_GET_BLOCKS_CREATE | 2002 2020 EXT4_GET_BLOCKS_METADATA_NOFAIL; ··· 2493 2515 return 0; 2494 2516 } 2495 2517 2518 + /* We always reserve for an inode update; the superblock could be there too */ 2519 + static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len) 2520 + { 2521 + if (likely(EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 2522 + EXT4_FEATURE_RO_COMPAT_LARGE_FILE))) 2523 + return 1; 2524 + 2525 + if (pos + len <= 0x7fffffffULL) 2526 + return 1; 2527 + 2528 + /* We might need to update the superblock to set LARGE_FILE */ 2529 + return 2; 2530 + } 2531 + 2496 2532 static int ext4_da_write_begin(struct file *file, struct address_space *mapping, 2497 2533 loff_t pos, unsigned len, unsigned flags, 2498 2534 struct page **pagep, void **fsdata) ··· 2557 2565 * of file which has an already mapped buffer. 2558 2566 */ 2559 2567 retry_journal: 2560 - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1); 2568 + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 2569 + ext4_da_write_credits(inode, pos, len)); 2561 2570 if (IS_ERR(handle)) { 2562 2571 page_cache_release(page); 2563 2572 return PTR_ERR(handle); ··· 2651 2658 if (copied && new_i_size > EXT4_I(inode)->i_disksize) { 2652 2659 if (ext4_has_inline_data(inode) || 2653 2660 ext4_da_should_update_i_disksize(page, end)) { 2654 - down_write(&EXT4_I(inode)->i_data_sem); 2655 - if (new_i_size > EXT4_I(inode)->i_disksize) 2656 - EXT4_I(inode)->i_disksize = new_i_size; 2657 - up_write(&EXT4_I(inode)->i_data_sem); 2661 + ext4_update_i_disksize(inode, new_i_size); 2658 2662 /* We need to mark inode dirty even if 2659 2663 * new_i_size is less that inode->i_size 2660 2664 * bu greater than i_disksize.(hint delalloc) ··· 3926 3936 ei->i_extra_isize = 0; 3927 3937 3928 3938 /* Precompute checksum seed for inode metadata */ 3929 - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 3930 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { 3939 + if (ext4_has_metadata_csum(sb)) { 3931 3940 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 3932 3941 __u32 csum; 3933 3942 __le32 inum = cpu_to_le32(inode->i_ino); ··· 4116 4127 return ERR_PTR(ret); 4117 4128 } 4118 4129 4130 + struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino) 4131 + { 4132 + if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) 4133 + return ERR_PTR(-EIO); 4134 + return ext4_iget(sb, ino); 4135 + } 4136 + 4119 4137 static int ext4_inode_blocks_set(handle_t *handle, 4120 4138 struct ext4_inode *raw_inode, 4121 4139 struct ext4_inode_info *ei) ··· 4222 4226 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); 4223 4227 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); 4224 4228 4225 - if (ext4_inode_blocks_set(handle, raw_inode, ei)) { 4229 + err = ext4_inode_blocks_set(handle, raw_inode, ei); 4230 + if (err) { 4226 4231 spin_unlock(&ei->i_raw_lock); 4227 4232 goto out_brelse; 4228 4233 } ··· 4533 4536 ext4_orphan_del(NULL, inode); 4534 4537 goto err_out; 4535 4538 } 4536 - } else 4539 + } else { 4540 + loff_t oldsize = inode->i_size; 4541 + 4537 4542 i_size_write(inode, attr->ia_size); 4543 + pagecache_isize_extended(inode, oldsize, inode->i_size); 4544 + } 4538 4545 4539 4546 /* 4540 4547 * Blocks are going to be removed from the inode. Wait

+10 -3

fs/ext4/ioctl.c

··· 331 331 if (!inode_owner_or_capable(inode)) 332 332 return -EPERM; 333 333 334 - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 335 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { 334 + if (ext4_has_metadata_csum(inode->i_sb)) { 336 335 ext4_warning(sb, "Setting inode version is not " 337 336 "supported with metadata_csum enabled."); 338 337 return -ENOTTY; ··· 531 532 } 532 533 533 534 case EXT4_IOC_SWAP_BOOT: 535 + { 536 + int err; 534 537 if (!(filp->f_mode & FMODE_WRITE)) 535 538 return -EBADF; 536 - return swap_inode_boot_loader(sb, inode); 539 + err = mnt_want_write_file(filp); 540 + if (err) 541 + return err; 542 + err = swap_inode_boot_loader(sb, inode); 543 + mnt_drop_write_file(filp); 544 + return err; 545 + } 537 546 538 547 case EXT4_IOC_RESIZE_FS: { 539 548 ext4_fsblk_t n_blocks_count;

+3 -12

fs/ext4/mballoc.c

··· 3155 3155 "start %lu, size %lu, fe_logical %lu", 3156 3156 (unsigned long) start, (unsigned long) size, 3157 3157 (unsigned long) ac->ac_o_ex.fe_logical); 3158 + BUG(); 3158 3159 } 3159 - BUG_ON(start + size <= ac->ac_o_ex.fe_logical && 3160 - start > ac->ac_o_ex.fe_logical); 3161 3160 BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 3162 3161 3163 3162 /* now prepare goal request */ ··· 4409 4410 if (IS_NOQUOTA(ar->inode)) 4410 4411 ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; 4411 4412 4412 - /* 4413 - * For delayed allocation, we could skip the ENOSPC and 4414 - * EDQUOT check, as blocks and quotas have been already 4415 - * reserved when data being copied into pagecache. 4416 - */ 4417 - if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) 4418 - ar->flags |= EXT4_MB_DELALLOC_RESERVED; 4419 - else { 4413 + if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) { 4420 4414 /* Without delayed allocation we need to verify 4421 4415 * there is enough free blocks to do block allocation 4422 4416 * and verify allocation doesn't exceed the quota limits. ··· 4520 4528 if (inquota && ar->len < inquota) 4521 4529 dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); 4522 4530 if (!ar->len) { 4523 - if (!ext4_test_inode_state(ar->inode, 4524 - EXT4_STATE_DELALLOC_RESERVED)) 4531 + if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) 4525 4532 /* release all the reserved blocks if non delalloc */ 4526 4533 percpu_counter_sub(&sbi->s_dirtyclusters_counter, 4527 4534 reserv_clstrs);

+4 -7

fs/ext4/migrate.c

··· 41 41 ext4_ext_store_pblock(&newext, lb->first_pblock); 42 42 /* Locking only for convinience since we are operating on temp inode */ 43 43 down_write(&EXT4_I(inode)->i_data_sem); 44 - path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0); 45 - 44 + path = ext4_find_extent(inode, lb->first_block, NULL, 0); 46 45 if (IS_ERR(path)) { 47 46 retval = PTR_ERR(path); 48 47 path = NULL; ··· 80 81 goto err_out; 81 82 } 82 83 } 83 - retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0); 84 + retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0); 84 85 err_out: 85 86 up_write((&EXT4_I(inode)->i_data_sem)); 86 - if (path) { 87 - ext4_ext_drop_refs(path); 88 - kfree(path); 89 - } 87 + ext4_ext_drop_refs(path); 88 + kfree(path); 90 89 lb->first_pblock = 0; 91 90 return retval; 92 91 }

+2 -4

fs/ext4/mmp.c

··· 20 20 21 21 static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) 22 22 { 23 - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 24 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 23 + if (!ext4_has_metadata_csum(sb)) 25 24 return 1; 26 25 27 26 return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp); ··· 28 29 29 30 static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) 30 31 { 31 - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 32 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 32 + if (!ext4_has_metadata_csum(sb)) 33 33 return; 34 34 35 35 mmp->mmp_checksum = ext4_mmp_csum(sb, mmp);

+122 -944

fs/ext4/move_extent.c

··· 27 27 * @lblock: logical block number to find an extent path 28 28 * @path: pointer to an extent path pointer (for output) 29 29 * 30 - * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value 30 + * ext4_find_extent wrapper. Return 0 on success, or a negative error value 31 31 * on failure. 32 32 */ 33 33 static inline int 34 34 get_ext_path(struct inode *inode, ext4_lblk_t lblock, 35 - struct ext4_ext_path **orig_path) 35 + struct ext4_ext_path **ppath) 36 36 { 37 - int ret = 0; 38 37 struct ext4_ext_path *path; 39 38 40 - path = ext4_ext_find_extent(inode, lblock, *orig_path, EXT4_EX_NOCACHE); 39 + path = ext4_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE); 41 40 if (IS_ERR(path)) 42 - ret = PTR_ERR(path); 43 - else if (path[ext_depth(inode)].p_ext == NULL) 44 - ret = -ENODATA; 45 - else 46 - *orig_path = path; 47 - 48 - return ret; 49 - } 50 - 51 - /** 52 - * copy_extent_status - Copy the extent's initialization status 53 - * 54 - * @src: an extent for getting initialize status 55 - * @dest: an extent to be set the status 56 - */ 57 - static void 58 - copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest) 59 - { 60 - if (ext4_ext_is_unwritten(src)) 61 - ext4_ext_mark_unwritten(dest); 62 - else 63 - dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest)); 64 - } 65 - 66 - /** 67 - * mext_next_extent - Search for the next extent and set it to "extent" 68 - * 69 - * @inode: inode which is searched 70 - * @path: this will obtain data for the next extent 71 - * @extent: pointer to the next extent we have just gotten 72 - * 73 - * Search the next extent in the array of ext4_ext_path structure (@path) 74 - * and set it to ext4_extent structure (@extent). In addition, the member of 75 - * @path (->p_ext) also points the next extent. Return 0 on success, 1 if 76 - * ext4_ext_path structure refers to the last extent, or a negative error 77 - * value on failure. 78 - */ 79 - int 80 - mext_next_extent(struct inode *inode, struct ext4_ext_path *path, 81 - struct ext4_extent **extent) 82 - { 83 - struct ext4_extent_header *eh; 84 - int ppos, leaf_ppos = path->p_depth; 85 - 86 - ppos = leaf_ppos; 87 - if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { 88 - /* leaf block */ 89 - *extent = ++path[ppos].p_ext; 90 - path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); 91 - return 0; 41 + return PTR_ERR(path); 42 + if (path[ext_depth(inode)].p_ext == NULL) { 43 + ext4_ext_drop_refs(path); 44 + kfree(path); 45 + *ppath = NULL; 46 + return -ENODATA; 92 47 } 93 - 94 - while (--ppos >= 0) { 95 - if (EXT_LAST_INDEX(path[ppos].p_hdr) > 96 - path[ppos].p_idx) { 97 - int cur_ppos = ppos; 98 - 99 - /* index block */ 100 - path[ppos].p_idx++; 101 - path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); 102 - if (path[ppos+1].p_bh) 103 - brelse(path[ppos+1].p_bh); 104 - path[ppos+1].p_bh = 105 - sb_bread(inode->i_sb, path[ppos].p_block); 106 - if (!path[ppos+1].p_bh) 107 - return -EIO; 108 - path[ppos+1].p_hdr = 109 - ext_block_hdr(path[ppos+1].p_bh); 110 - 111 - /* Halfway index block */ 112 - while (++cur_ppos < leaf_ppos) { 113 - path[cur_ppos].p_idx = 114 - EXT_FIRST_INDEX(path[cur_ppos].p_hdr); 115 - path[cur_ppos].p_block = 116 - ext4_idx_pblock(path[cur_ppos].p_idx); 117 - if (path[cur_ppos+1].p_bh) 118 - brelse(path[cur_ppos+1].p_bh); 119 - path[cur_ppos+1].p_bh = sb_bread(inode->i_sb, 120 - path[cur_ppos].p_block); 121 - if (!path[cur_ppos+1].p_bh) 122 - return -EIO; 123 - path[cur_ppos+1].p_hdr = 124 - ext_block_hdr(path[cur_ppos+1].p_bh); 125 - } 126 - 127 - path[leaf_ppos].p_ext = *extent = NULL; 128 - 129 - eh = path[leaf_ppos].p_hdr; 130 - if (le16_to_cpu(eh->eh_entries) == 0) 131 - /* empty leaf is found */ 132 - return -ENODATA; 133 - 134 - /* leaf block */ 135 - path[leaf_ppos].p_ext = *extent = 136 - EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); 137 - path[leaf_ppos].p_block = 138 - ext4_ext_pblock(path[leaf_ppos].p_ext); 139 - return 0; 140 - } 141 - } 142 - /* We found the last extent */ 143 - return 1; 48 + *ppath = path; 49 + return 0; 144 50 } 145 51 146 52 /** ··· 84 178 } 85 179 86 180 /** 87 - * mext_insert_across_blocks - Insert extents across leaf block 88 - * 89 - * @handle: journal handle 90 - * @orig_inode: original inode 91 - * @o_start: first original extent to be changed 92 - * @o_end: last original extent to be changed 93 - * @start_ext: first new extent to be inserted 94 - * @new_ext: middle of new extent to be inserted 95 - * @end_ext: last new extent to be inserted 96 - * 97 - * Allocate a new leaf block and insert extents into it. Return 0 on success, 98 - * or a negative error value on failure. 99 - */ 100 - static int 101 - mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, 102 - struct ext4_extent *o_start, struct ext4_extent *o_end, 103 - struct ext4_extent *start_ext, struct ext4_extent *new_ext, 104 - struct ext4_extent *end_ext) 105 - { 106 - struct ext4_ext_path *orig_path = NULL; 107 - ext4_lblk_t eblock = 0; 108 - int new_flag = 0; 109 - int end_flag = 0; 110 - int err = 0; 111 - 112 - if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) { 113 - if (o_start == o_end) { 114 - 115 - /* start_ext new_ext end_ext 116 - * donor |---------|-----------|--------| 117 - * orig |------------------------------| 118 - */ 119 - end_flag = 1; 120 - } else { 121 - 122 - /* start_ext new_ext end_ext 123 - * donor |---------|----------|---------| 124 - * orig |---------------|--------------| 125 - */ 126 - o_end->ee_block = end_ext->ee_block; 127 - o_end->ee_len = end_ext->ee_len; 128 - ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); 129 - } 130 - 131 - o_start->ee_len = start_ext->ee_len; 132 - eblock = le32_to_cpu(start_ext->ee_block); 133 - new_flag = 1; 134 - 135 - } else if (start_ext->ee_len && new_ext->ee_len && 136 - !end_ext->ee_len && o_start == o_end) { 137 - 138 - /* start_ext new_ext 139 - * donor |--------------|---------------| 140 - * orig |------------------------------| 141 - */ 142 - o_start->ee_len = start_ext->ee_len; 143 - eblock = le32_to_cpu(start_ext->ee_block); 144 - new_flag = 1; 145 - 146 - } else if (!start_ext->ee_len && new_ext->ee_len && 147 - end_ext->ee_len && o_start == o_end) { 148 - 149 - /* new_ext end_ext 150 - * donor |--------------|---------------| 151 - * orig |------------------------------| 152 - */ 153 - o_end->ee_block = end_ext->ee_block; 154 - o_end->ee_len = end_ext->ee_len; 155 - ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); 156 - 157 - /* 158 - * Set 0 to the extent block if new_ext was 159 - * the first block. 160 - */ 161 - if (new_ext->ee_block) 162 - eblock = le32_to_cpu(new_ext->ee_block); 163 - 164 - new_flag = 1; 165 - } else { 166 - ext4_debug("ext4 move extent: Unexpected insert case\n"); 167 - return -EIO; 168 - } 169 - 170 - if (new_flag) { 171 - err = get_ext_path(orig_inode, eblock, &orig_path); 172 - if (err) 173 - goto out; 174 - 175 - if (ext4_ext_insert_extent(handle, orig_inode, 176 - orig_path, new_ext, 0)) 177 - goto out; 178 - } 179 - 180 - if (end_flag) { 181 - err = get_ext_path(orig_inode, 182 - le32_to_cpu(end_ext->ee_block) - 1, &orig_path); 183 - if (err) 184 - goto out; 185 - 186 - if (ext4_ext_insert_extent(handle, orig_inode, 187 - orig_path, end_ext, 0)) 188 - goto out; 189 - } 190 - out: 191 - if (orig_path) { 192 - ext4_ext_drop_refs(orig_path); 193 - kfree(orig_path); 194 - } 195 - 196 - return err; 197 - 198 - } 199 - 200 - /** 201 - * mext_insert_inside_block - Insert new extent to the extent block 202 - * 203 - * @o_start: first original extent to be moved 204 - * @o_end: last original extent to be moved 205 - * @start_ext: first new extent to be inserted 206 - * @new_ext: middle of new extent to be inserted 207 - * @end_ext: last new extent to be inserted 208 - * @eh: extent header of target leaf block 209 - * @range_to_move: used to decide how to insert extent 210 - * 211 - * Insert extents into the leaf block. The extent (@o_start) is overwritten 212 - * by inserted extents. 213 - */ 214 - static void 215 - mext_insert_inside_block(struct ext4_extent *o_start, 216 - struct ext4_extent *o_end, 217 - struct ext4_extent *start_ext, 218 - struct ext4_extent *new_ext, 219 - struct ext4_extent *end_ext, 220 - struct ext4_extent_header *eh, 221 - int range_to_move) 222 - { 223 - int i = 0; 224 - unsigned long len; 225 - 226 - /* Move the existing extents */ 227 - if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) { 228 - len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) - 229 - (unsigned long)(o_end + 1); 230 - memmove(o_end + 1 + range_to_move, o_end + 1, len); 231 - } 232 - 233 - /* Insert start entry */ 234 - if (start_ext->ee_len) 235 - o_start[i++].ee_len = start_ext->ee_len; 236 - 237 - /* Insert new entry */ 238 - if (new_ext->ee_len) { 239 - o_start[i] = *new_ext; 240 - ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext)); 241 - } 242 - 243 - /* Insert end entry */ 244 - if (end_ext->ee_len) 245 - o_start[i] = *end_ext; 246 - 247 - /* Increment the total entries counter on the extent block */ 248 - le16_add_cpu(&eh->eh_entries, range_to_move); 249 - } 250 - 251 - /** 252 - * mext_insert_extents - Insert new extent 253 - * 254 - * @handle: journal handle 255 - * @orig_inode: original inode 256 - * @orig_path: path indicates first extent to be changed 257 - * @o_start: first original extent to be changed 258 - * @o_end: last original extent to be changed 259 - * @start_ext: first new extent to be inserted 260 - * @new_ext: middle of new extent to be inserted 261 - * @end_ext: last new extent to be inserted 262 - * 263 - * Call the function to insert extents. If we cannot add more extents into 264 - * the leaf block, we call mext_insert_across_blocks() to create a 265 - * new leaf block. Otherwise call mext_insert_inside_block(). Return 0 266 - * on success, or a negative error value on failure. 267 - */ 268 - static int 269 - mext_insert_extents(handle_t *handle, struct inode *orig_inode, 270 - struct ext4_ext_path *orig_path, 271 - struct ext4_extent *o_start, 272 - struct ext4_extent *o_end, 273 - struct ext4_extent *start_ext, 274 - struct ext4_extent *new_ext, 275 - struct ext4_extent *end_ext) 276 - { 277 - struct ext4_extent_header *eh; 278 - unsigned long need_slots, slots_range; 279 - int range_to_move, depth, ret; 280 - 281 - /* 282 - * The extents need to be inserted 283 - * start_extent + new_extent + end_extent. 284 - */ 285 - need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) + 286 - (new_ext->ee_len ? 1 : 0); 287 - 288 - /* The number of slots between start and end */ 289 - slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1) 290 - / sizeof(struct ext4_extent); 291 - 292 - /* Range to move the end of extent */ 293 - range_to_move = need_slots - slots_range; 294 - depth = orig_path->p_depth; 295 - orig_path += depth; 296 - eh = orig_path->p_hdr; 297 - 298 - if (depth) { 299 - /* Register to journal */ 300 - BUFFER_TRACE(orig_path->p_bh, "get_write_access"); 301 - ret = ext4_journal_get_write_access(handle, orig_path->p_bh); 302 - if (ret) 303 - return ret; 304 - } 305 - 306 - /* Expansion */ 307 - if (range_to_move > 0 && 308 - (range_to_move > le16_to_cpu(eh->eh_max) 309 - - le16_to_cpu(eh->eh_entries))) { 310 - 311 - ret = mext_insert_across_blocks(handle, orig_inode, o_start, 312 - o_end, start_ext, new_ext, end_ext); 313 - if (ret < 0) 314 - return ret; 315 - } else 316 - mext_insert_inside_block(o_start, o_end, start_ext, new_ext, 317 - end_ext, eh, range_to_move); 318 - 319 - return ext4_ext_dirty(handle, orig_inode, orig_path); 320 - } 321 - 322 - /** 323 - * mext_leaf_block - Move one leaf extent block into the inode. 324 - * 325 - * @handle: journal handle 326 - * @orig_inode: original inode 327 - * @orig_path: path indicates first extent to be changed 328 - * @dext: donor extent 329 - * @from: start offset on the target file 330 - * 331 - * In order to insert extents into the leaf block, we must divide the extent 332 - * in the leaf block into three extents. The one is located to be inserted 333 - * extents, and the others are located around it. 334 - * 335 - * Therefore, this function creates structures to save extents of the leaf 336 - * block, and inserts extents by calling mext_insert_extents() with 337 - * created extents. Return 0 on success, or a negative error value on failure. 338 - */ 339 - static int 340 - mext_leaf_block(handle_t *handle, struct inode *orig_inode, 341 - struct ext4_ext_path *orig_path, struct ext4_extent *dext, 342 - ext4_lblk_t *from) 343 - { 344 - struct ext4_extent *oext, *o_start, *o_end, *prev_ext; 345 - struct ext4_extent new_ext, start_ext, end_ext; 346 - ext4_lblk_t new_ext_end; 347 - int oext_alen, new_ext_alen, end_ext_alen; 348 - int depth = ext_depth(orig_inode); 349 - int ret; 350 - 351 - start_ext.ee_block = end_ext.ee_block = 0; 352 - o_start = o_end = oext = orig_path[depth].p_ext; 353 - oext_alen = ext4_ext_get_actual_len(oext); 354 - start_ext.ee_len = end_ext.ee_len = 0; 355 - 356 - new_ext.ee_block = cpu_to_le32(*from); 357 - ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext)); 358 - new_ext.ee_len = dext->ee_len; 359 - new_ext_alen = ext4_ext_get_actual_len(&new_ext); 360 - new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; 361 - 362 - /* 363 - * Case: original extent is first 364 - * oext |--------| 365 - * new_ext |--| 366 - * start_ext |--| 367 - */ 368 - if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) && 369 - le32_to_cpu(new_ext.ee_block) < 370 - le32_to_cpu(oext->ee_block) + oext_alen) { 371 - start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) - 372 - le32_to_cpu(oext->ee_block)); 373 - start_ext.ee_block = oext->ee_block; 374 - copy_extent_status(oext, &start_ext); 375 - } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) { 376 - prev_ext = oext - 1; 377 - /* 378 - * We can merge new_ext into previous extent, 379 - * if these are contiguous and same extent type. 380 - */ 381 - if (ext4_can_extents_be_merged(orig_inode, prev_ext, 382 - &new_ext)) { 383 - o_start = prev_ext; 384 - start_ext.ee_len = cpu_to_le16( 385 - ext4_ext_get_actual_len(prev_ext) + 386 - new_ext_alen); 387 - start_ext.ee_block = oext->ee_block; 388 - copy_extent_status(prev_ext, &start_ext); 389 - new_ext.ee_len = 0; 390 - } 391 - } 392 - 393 - /* 394 - * Case: new_ext_end must be less than oext 395 - * oext |-----------| 396 - * new_ext |-------| 397 - */ 398 - if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { 399 - EXT4_ERROR_INODE(orig_inode, 400 - "new_ext_end(%u) should be less than or equal to " 401 - "oext->ee_block(%u) + oext_alen(%d) - 1", 402 - new_ext_end, le32_to_cpu(oext->ee_block), 403 - oext_alen); 404 - ret = -EIO; 405 - goto out; 406 - } 407 - 408 - /* 409 - * Case: new_ext is smaller than original extent 410 - * oext |---------------| 411 - * new_ext |-----------| 412 - * end_ext |---| 413 - */ 414 - if (le32_to_cpu(oext->ee_block) <= new_ext_end && 415 - new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) { 416 - end_ext.ee_len = 417 - cpu_to_le16(le32_to_cpu(oext->ee_block) + 418 - oext_alen - 1 - new_ext_end); 419 - copy_extent_status(oext, &end_ext); 420 - end_ext_alen = ext4_ext_get_actual_len(&end_ext); 421 - ext4_ext_store_pblock(&end_ext, 422 - (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen)); 423 - end_ext.ee_block = 424 - cpu_to_le32(le32_to_cpu(o_end->ee_block) + 425 - oext_alen - end_ext_alen); 426 - } 427 - 428 - ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, 429 - o_end, &start_ext, &new_ext, &end_ext); 430 - out: 431 - return ret; 432 - } 433 - 434 - /** 435 - * mext_calc_swap_extents - Calculate extents for extent swapping. 436 - * 437 - * @tmp_dext: the extent that will belong to the original inode 438 - * @tmp_oext: the extent that will belong to the donor inode 439 - * @orig_off: block offset of original inode 440 - * @donor_off: block offset of donor inode 441 - * @max_count: the maximum length of extents 442 - * 443 - * Return 0 on success, or a negative error value on failure. 444 - */ 445 - static int 446 - mext_calc_swap_extents(struct ext4_extent *tmp_dext, 447 - struct ext4_extent *tmp_oext, 448 - ext4_lblk_t orig_off, ext4_lblk_t donor_off, 449 - ext4_lblk_t max_count) 450 - { 451 - ext4_lblk_t diff, orig_diff; 452 - struct ext4_extent dext_old, oext_old; 453 - 454 - BUG_ON(orig_off != donor_off); 455 - 456 - /* original and donor extents have to cover the same block offset */ 457 - if (orig_off < le32_to_cpu(tmp_oext->ee_block) || 458 - le32_to_cpu(tmp_oext->ee_block) + 459 - ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off) 460 - return -ENODATA; 461 - 462 - if (orig_off < le32_to_cpu(tmp_dext->ee_block) || 463 - le32_to_cpu(tmp_dext->ee_block) + 464 - ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off) 465 - return -ENODATA; 466 - 467 - dext_old = *tmp_dext; 468 - oext_old = *tmp_oext; 469 - 470 - /* When tmp_dext is too large, pick up the target range. */ 471 - diff = donor_off - le32_to_cpu(tmp_dext->ee_block); 472 - 473 - ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff); 474 - le32_add_cpu(&tmp_dext->ee_block, diff); 475 - le16_add_cpu(&tmp_dext->ee_len, -diff); 476 - 477 - if (max_count < ext4_ext_get_actual_len(tmp_dext)) 478 - tmp_dext->ee_len = cpu_to_le16(max_count); 479 - 480 - orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block); 481 - ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff); 482 - 483 - /* Adjust extent length if donor extent is larger than orig */ 484 - if (ext4_ext_get_actual_len(tmp_dext) > 485 - ext4_ext_get_actual_len(tmp_oext) - orig_diff) 486 - tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) - 487 - orig_diff); 488 - 489 - tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext)); 490 - 491 - copy_extent_status(&oext_old, tmp_dext); 492 - copy_extent_status(&dext_old, tmp_oext); 493 - 494 - return 0; 495 - } 496 - 497 - /** 498 181 * mext_check_coverage - Check that all extents in range has the same type 499 182 * 500 183 * @inode: inode in question ··· 114 619 } 115 620 ret = 1; 116 621 out: 117 - if (path) { 118 - ext4_ext_drop_refs(path); 119 - kfree(path); 120 - } 622 + ext4_ext_drop_refs(path); 623 + kfree(path); 121 624 return ret; 122 - } 123 - 124 - /** 125 - * mext_replace_branches - Replace original extents with new extents 126 - * 127 - * @handle: journal handle 128 - * @orig_inode: original inode 129 - * @donor_inode: donor inode 130 - * @from: block offset of orig_inode 131 - * @count: block count to be replaced 132 - * @err: pointer to save return value 133 - * 134 - * Replace original inode extents and donor inode extents page by page. 135 - * We implement this replacement in the following three steps: 136 - * 1. Save the block information of original and donor inodes into 137 - * dummy extents. 138 - * 2. Change the block information of original inode to point at the 139 - * donor inode blocks. 140 - * 3. Change the block information of donor inode to point at the saved 141 - * original inode blocks in the dummy extents. 142 - * 143 - * Return replaced block count. 144 - */ 145 - static int 146 - mext_replace_branches(handle_t *handle, struct inode *orig_inode, 147 - struct inode *donor_inode, ext4_lblk_t from, 148 - ext4_lblk_t count, int *err) 149 - { 150 - struct ext4_ext_path *orig_path = NULL; 151 - struct ext4_ext_path *donor_path = NULL; 152 - struct ext4_extent *oext, *dext; 153 - struct ext4_extent tmp_dext, tmp_oext; 154 - ext4_lblk_t orig_off = from, donor_off = from; 155 - int depth; 156 - int replaced_count = 0; 157 - int dext_alen; 158 - 159 - *err = ext4_es_remove_extent(orig_inode, from, count); 160 - if (*err) 161 - goto out; 162 - 163 - *err = ext4_es_remove_extent(donor_inode, from, count); 164 - if (*err) 165 - goto out; 166 - 167 - /* Get the original extent for the block "orig_off" */ 168 - *err = get_ext_path(orig_inode, orig_off, &orig_path); 169 - if (*err) 170 - goto out; 171 - 172 - /* Get the donor extent for the head */ 173 - *err = get_ext_path(donor_inode, donor_off, &donor_path); 174 - if (*err) 175 - goto out; 176 - depth = ext_depth(orig_inode); 177 - oext = orig_path[depth].p_ext; 178 - tmp_oext = *oext; 179 - 180 - depth = ext_depth(donor_inode); 181 - dext = donor_path[depth].p_ext; 182 - if (unlikely(!dext)) 183 - goto missing_donor_extent; 184 - tmp_dext = *dext; 185 - 186 - *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, 187 - donor_off, count); 188 - if (*err) 189 - goto out; 190 - 191 - /* Loop for the donor extents */ 192 - while (1) { 193 - /* The extent for donor must be found. */ 194 - if (unlikely(!dext)) { 195 - missing_donor_extent: 196 - EXT4_ERROR_INODE(donor_inode, 197 - "The extent for donor must be found"); 198 - *err = -EIO; 199 - goto out; 200 - } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { 201 - EXT4_ERROR_INODE(donor_inode, 202 - "Donor offset(%u) and the first block of donor " 203 - "extent(%u) should be equal", 204 - donor_off, 205 - le32_to_cpu(tmp_dext.ee_block)); 206 - *err = -EIO; 207 - goto out; 208 - } 209 - 210 - /* Set donor extent to orig extent */ 211 - *err = mext_leaf_block(handle, orig_inode, 212 - orig_path, &tmp_dext, &orig_off); 213 - if (*err) 214 - goto out; 215 - 216 - /* Set orig extent to donor extent */ 217 - *err = mext_leaf_block(handle, donor_inode, 218 - donor_path, &tmp_oext, &donor_off); 219 - if (*err) 220 - goto out; 221 - 222 - dext_alen = ext4_ext_get_actual_len(&tmp_dext); 223 - replaced_count += dext_alen; 224 - donor_off += dext_alen; 225 - orig_off += dext_alen; 226 - 227 - BUG_ON(replaced_count > count); 228 - /* Already moved the expected blocks */ 229 - if (replaced_count >= count) 230 - break; 231 - 232 - if (orig_path) 233 - ext4_ext_drop_refs(orig_path); 234 - *err = get_ext_path(orig_inode, orig_off, &orig_path); 235 - if (*err) 236 - goto out; 237 - depth = ext_depth(orig_inode); 238 - oext = orig_path[depth].p_ext; 239 - tmp_oext = *oext; 240 - 241 - if (donor_path) 242 - ext4_ext_drop_refs(donor_path); 243 - *err = get_ext_path(donor_inode, donor_off, &donor_path); 244 - if (*err) 245 - goto out; 246 - depth = ext_depth(donor_inode); 247 - dext = donor_path[depth].p_ext; 248 - tmp_dext = *dext; 249 - 250 - *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, 251 - donor_off, count - replaced_count); 252 - if (*err) 253 - goto out; 254 - } 255 - 256 - out: 257 - if (orig_path) { 258 - ext4_ext_drop_refs(orig_path); 259 - kfree(orig_path); 260 - } 261 - if (donor_path) { 262 - ext4_ext_drop_refs(donor_path); 263 - kfree(donor_path); 264 - } 265 - 266 - return replaced_count; 267 625 } 268 626 269 627 /** ··· 124 776 * 125 777 * @inode1: the inode structure 126 778 * @inode2: the inode structure 127 - * @index: page index 779 + * @index1: page index 780 + * @index2: page index 128 781 * @page: result page vector 129 782 * 130 783 * Grab two locked pages for inode's by inode order 131 784 */ 132 785 static int 133 786 mext_page_double_lock(struct inode *inode1, struct inode *inode2, 134 - pgoff_t index, struct page *page[2]) 787 + pgoff_t index1, pgoff_t index2, struct page *page[2]) 135 788 { 136 789 struct address_space *mapping[2]; 137 790 unsigned fl = AOP_FLAG_NOFS; ··· 142 793 mapping[0] = inode1->i_mapping; 143 794 mapping[1] = inode2->i_mapping; 144 795 } else { 796 + pgoff_t tmp = index1; 797 + index1 = index2; 798 + index2 = tmp; 145 799 mapping[0] = inode2->i_mapping; 146 800 mapping[1] = inode1->i_mapping; 147 801 } 148 802 149 - page[0] = grab_cache_page_write_begin(mapping[0], index, fl); 803 + page[0] = grab_cache_page_write_begin(mapping[0], index1, fl); 150 804 if (!page[0]) 151 805 return -ENOMEM; 152 806 153 - page[1] = grab_cache_page_write_begin(mapping[1], index, fl); 807 + page[1] = grab_cache_page_write_begin(mapping[1], index2, fl); 154 808 if (!page[1]) { 155 809 unlock_page(page[0]); 156 810 page_cache_release(page[0]); ··· 245 893 * @o_filp: file structure of original file 246 894 * @donor_inode: donor inode 247 895 * @orig_page_offset: page index on original file 896 + * @donor_page_offset: page index on donor file 248 897 * @data_offset_in_page: block index where data swapping starts 249 898 * @block_len_in_page: the number of blocks to be swapped 250 899 * @unwritten: orig extent is unwritten or not 251 900 * @err: pointer to save return value 252 901 * 253 902 * Save the data in original inode blocks and replace original inode extents 254 - * with donor inode extents by calling mext_replace_branches(). 903 + * with donor inode extents by calling ext4_swap_extents(). 255 904 * Finally, write out the saved data in new original inode blocks. Return 256 905 * replaced block count. 257 906 */ 258 907 static int 259 908 move_extent_per_page(struct file *o_filp, struct inode *donor_inode, 260 - pgoff_t orig_page_offset, int data_offset_in_page, 261 - int block_len_in_page, int unwritten, int *err) 909 + pgoff_t orig_page_offset, pgoff_t donor_page_offset, 910 + int data_offset_in_page, 911 + int block_len_in_page, int unwritten, int *err) 262 912 { 263 913 struct inode *orig_inode = file_inode(o_filp); 264 914 struct page *pagep[2] = {NULL, NULL}; 265 915 handle_t *handle; 266 - ext4_lblk_t orig_blk_offset; 916 + ext4_lblk_t orig_blk_offset, donor_blk_offset; 267 917 unsigned long blocksize = orig_inode->i_sb->s_blocksize; 268 918 unsigned int w_flags = 0; 269 919 unsigned int tmp_data_size, data_size, replaced_size; ··· 293 939 orig_blk_offset = orig_page_offset * blocks_per_page + 294 940 data_offset_in_page; 295 941 942 + donor_blk_offset = donor_page_offset * blocks_per_page + 943 + data_offset_in_page; 944 + 296 945 /* Calculate data_size */ 297 946 if ((orig_blk_offset + block_len_in_page - 1) == 298 947 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { ··· 316 959 replaced_size = data_size; 317 960 318 961 *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset, 319 - pagep); 962 + donor_page_offset, pagep); 320 963 if (unlikely(*err < 0)) 321 964 goto stop_journal; 322 965 /* ··· 335 978 if (*err) 336 979 goto drop_data_sem; 337 980 338 - unwritten &= mext_check_coverage(donor_inode, orig_blk_offset, 981 + unwritten &= mext_check_coverage(donor_inode, donor_blk_offset, 339 982 block_len_in_page, 1, err); 340 983 if (*err) 341 984 goto drop_data_sem; ··· 351 994 *err = -EBUSY; 352 995 goto drop_data_sem; 353 996 } 354 - replaced_count = mext_replace_branches(handle, orig_inode, 355 - donor_inode, orig_blk_offset, 356 - block_len_in_page, err); 997 + replaced_count = ext4_swap_extents(handle, orig_inode, 998 + donor_inode, orig_blk_offset, 999 + donor_blk_offset, 1000 + block_len_in_page, 1, err); 357 1001 drop_data_sem: 358 1002 ext4_double_up_write_data_sem(orig_inode, donor_inode); 359 1003 goto unlock_pages; ··· 372 1014 goto unlock_pages; 373 1015 } 374 1016 ext4_double_down_write_data_sem(orig_inode, donor_inode); 375 - replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, 376 - orig_blk_offset, 377 - block_len_in_page, err); 1017 + replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, 1018 + orig_blk_offset, donor_blk_offset, 1019 + block_len_in_page, 1, err); 378 1020 ext4_double_up_write_data_sem(orig_inode, donor_inode); 379 1021 if (*err) { 380 1022 if (replaced_count) { ··· 419 1061 * Try to swap extents to it's original places 420 1062 */ 421 1063 ext4_double_down_write_data_sem(orig_inode, donor_inode); 422 - replaced_count = mext_replace_branches(handle, donor_inode, orig_inode, 423 - orig_blk_offset, 424 - block_len_in_page, &err2); 1064 + replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode, 1065 + orig_blk_offset, donor_blk_offset, 1066 + block_len_in_page, 0, &err2); 425 1067 ext4_double_up_write_data_sem(orig_inode, donor_inode); 426 1068 if (replaced_count != block_len_in_page) { 427 1069 EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), ··· 451 1093 struct inode *donor_inode, __u64 orig_start, 452 1094 __u64 donor_start, __u64 *len) 453 1095 { 454 - ext4_lblk_t orig_blocks, donor_blocks; 1096 + __u64 orig_eof, donor_eof; 455 1097 unsigned int blkbits = orig_inode->i_blkbits; 456 1098 unsigned int blocksize = 1 << blkbits; 1099 + 1100 + orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits; 1101 + donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits; 1102 + 457 1103 458 1104 if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { 459 1105 ext4_debug("ext4 move extent: suid or sgid is set" ··· 474 1112 ext4_debug("ext4 move extent: The argument files should " 475 1113 "not be swapfile [ino:orig %lu, donor %lu]\n", 476 1114 orig_inode->i_ino, donor_inode->i_ino); 477 - return -EINVAL; 1115 + return -EBUSY; 478 1116 } 479 1117 480 1118 /* Ext4 move extent supports only extent based file */ ··· 494 1132 } 495 1133 496 1134 /* Start offset should be same */ 497 - if (orig_start != donor_start) { 1135 + if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) != 1136 + (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) { 498 1137 ext4_debug("ext4 move extent: orig and donor's start " 499 - "offset are not same [ino:orig %lu, donor %lu]\n", 1138 + "offset are not alligned [ino:orig %lu, donor %lu]\n", 500 1139 orig_inode->i_ino, donor_inode->i_ino); 501 1140 return -EINVAL; 502 1141 } 503 1142 504 1143 if ((orig_start >= EXT_MAX_BLOCKS) || 1144 + (donor_start >= EXT_MAX_BLOCKS) || 505 1145 (*len > EXT_MAX_BLOCKS) || 1146 + (donor_start + *len >= EXT_MAX_BLOCKS) || 506 1147 (orig_start + *len >= EXT_MAX_BLOCKS)) { 507 1148 ext4_debug("ext4 move extent: Can't handle over [%u] blocks " 508 1149 "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS, 509 1150 orig_inode->i_ino, donor_inode->i_ino); 510 1151 return -EINVAL; 511 1152 } 512 - 513 - if (orig_inode->i_size > donor_inode->i_size) { 514 - donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits; 515 - /* TODO: eliminate this artificial restriction */ 516 - if (orig_start >= donor_blocks) { 517 - ext4_debug("ext4 move extent: orig start offset " 518 - "[%llu] should be less than donor file blocks " 519 - "[%u] [ino:orig %lu, donor %lu]\n", 520 - orig_start, donor_blocks, 521 - orig_inode->i_ino, donor_inode->i_ino); 522 - return -EINVAL; 523 - } 524 - 525 - /* TODO: eliminate this artificial restriction */ 526 - if (orig_start + *len > donor_blocks) { 527 - ext4_debug("ext4 move extent: End offset [%llu] should " 528 - "be less than donor file blocks [%u]." 529 - "So adjust length from %llu to %llu " 530 - "[ino:orig %lu, donor %lu]\n", 531 - orig_start + *len, donor_blocks, 532 - *len, donor_blocks - orig_start, 533 - orig_inode->i_ino, donor_inode->i_ino); 534 - *len = donor_blocks - orig_start; 535 - } 536 - } else { 537 - orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits; 538 - if (orig_start >= orig_blocks) { 539 - ext4_debug("ext4 move extent: start offset [%llu] " 540 - "should be less than original file blocks " 541 - "[%u] [ino:orig %lu, donor %lu]\n", 542 - orig_start, orig_blocks, 543 - orig_inode->i_ino, donor_inode->i_ino); 544 - return -EINVAL; 545 - } 546 - 547 - if (orig_start + *len > orig_blocks) { 548 - ext4_debug("ext4 move extent: Adjust length " 549 - "from %llu to %llu. Because it should be " 550 - "less than original file blocks " 551 - "[ino:orig %lu, donor %lu]\n", 552 - *len, orig_blocks - orig_start, 553 - orig_inode->i_ino, donor_inode->i_ino); 554 - *len = orig_blocks - orig_start; 555 - } 556 - } 557 - 1153 + if (orig_eof < orig_start + *len - 1) 1154 + *len = orig_eof - orig_start; 1155 + if (donor_eof < donor_start + *len - 1) 1156 + *len = donor_eof - donor_start; 558 1157 if (!*len) { 559 1158 ext4_debug("ext4 move extent: len should not be 0 " 560 1159 "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, ··· 531 1208 * 532 1209 * @o_filp: file structure of the original file 533 1210 * @d_filp: file structure of the donor file 534 - * @orig_start: start offset in block for orig 535 - * @donor_start: start offset in block for donor 1211 + * @orig_blk: start offset in block for orig 1212 + * @donor_blk: start offset in block for donor 536 1213 * @len: the number of blocks to be moved 537 1214 * @moved_len: moved block length 538 1215 * 539 1216 * This function returns 0 and moved block length is set in moved_len 540 1217 * if succeed, otherwise returns error value. 541 1218 * 542 - * Note: ext4_move_extents() proceeds the following order. 543 - * 1:ext4_move_extents() calculates the last block number of moving extent 544 - * function by the start block number (orig_start) and the number of blocks 545 - * to be moved (len) specified as arguments. 546 - * If the {orig, donor}_start points a hole, the extent's start offset 547 - * pointed by ext_cur (current extent), holecheck_path, orig_path are set 548 - * after hole behind. 549 - * 2:Continue step 3 to step 5, until the holecheck_path points to last_extent 550 - * or the ext_cur exceeds the block_end which is last logical block number. 551 - * 3:To get the length of continues area, call mext_next_extent() 552 - * specified with the ext_cur (initial value is holecheck_path) re-cursive, 553 - * until find un-continuous extent, the start logical block number exceeds 554 - * the block_end or the extent points to the last extent. 555 - * 4:Exchange the original inode data with donor inode data 556 - * from orig_page_offset to seq_end_page. 557 - * The start indexes of data are specified as arguments. 558 - * That of the original inode is orig_page_offset, 559 - * and the donor inode is also orig_page_offset 560 - * (To easily handle blocksize != pagesize case, the offset for the 561 - * donor inode is block unit). 562 - * 5:Update holecheck_path and orig_path to points a next proceeding extent, 563 - * then returns to step 2. 564 - * 6:Release holecheck_path, orig_path and set the len to moved_len 565 - * which shows the number of moved blocks. 566 - * The moved_len is useful for the command to calculate the file offset 567 - * for starting next move extent ioctl. 568 - * 7:Return 0 on success, or a negative error value on failure. 569 1219 */ 570 1220 int 571 - ext4_move_extents(struct file *o_filp, struct file *d_filp, 572 - __u64 orig_start, __u64 donor_start, __u64 len, 573 - __u64 *moved_len) 1221 + ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, 1222 + __u64 donor_blk, __u64 len, __u64 *moved_len) 574 1223 { 575 1224 struct inode *orig_inode = file_inode(o_filp); 576 1225 struct inode *donor_inode = file_inode(d_filp); 577 - struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL; 578 - struct ext4_extent *ext_prev, *ext_cur, *ext_dummy; 579 - ext4_lblk_t block_start = orig_start; 580 - ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; 581 - ext4_lblk_t rest_blocks; 582 - pgoff_t orig_page_offset = 0, seq_end_page; 583 - int ret, depth, last_extent = 0; 1226 + struct ext4_ext_path *path = NULL; 584 1227 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; 585 - int data_offset_in_page; 586 - int block_len_in_page; 587 - int unwritten; 1228 + ext4_lblk_t o_end, o_start = orig_blk; 1229 + ext4_lblk_t d_start = donor_blk; 1230 + int ret; 588 1231 589 1232 if (orig_inode->i_sb != donor_inode->i_sb) { 590 1233 ext4_debug("ext4 move extent: The argument files " ··· 592 1303 /* Protect extent tree against block allocations via delalloc */ 593 1304 ext4_double_down_write_data_sem(orig_inode, donor_inode); 594 1305 /* Check the filesystem environment whether move_extent can be done */ 595 - ret = mext_check_arguments(orig_inode, donor_inode, orig_start, 596 - donor_start, &len); 1306 + ret = mext_check_arguments(orig_inode, donor_inode, orig_blk, 1307 + donor_blk, &len); 597 1308 if (ret) 598 1309 goto out; 1310 + o_end = o_start + len; 599 1311 600 - file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; 601 - block_end = block_start + len - 1; 602 - if (file_end < block_end) 603 - len -= block_end - file_end; 1312 + while (o_start < o_end) { 1313 + struct ext4_extent *ex; 1314 + ext4_lblk_t cur_blk, next_blk; 1315 + pgoff_t orig_page_index, donor_page_index; 1316 + int offset_in_page; 1317 + int unwritten, cur_len; 604 1318 605 - ret = get_ext_path(orig_inode, block_start, &orig_path); 606 - if (ret) 607 - goto out; 608 - 609 - /* Get path structure to check the hole */ 610 - ret = get_ext_path(orig_inode, block_start, &holecheck_path); 611 - if (ret) 612 - goto out; 613 - 614 - depth = ext_depth(orig_inode); 615 - ext_cur = holecheck_path[depth].p_ext; 616 - 617 - /* 618 - * Get proper starting location of block replacement if block_start was 619 - * within the hole. 620 - */ 621 - if (le32_to_cpu(ext_cur->ee_block) + 622 - ext4_ext_get_actual_len(ext_cur) - 1 < block_start) { 623 - /* 624 - * The hole exists between extents or the tail of 625 - * original file. 626 - */ 627 - last_extent = mext_next_extent(orig_inode, 628 - holecheck_path, &ext_cur); 629 - if (last_extent < 0) { 630 - ret = last_extent; 1319 + ret = get_ext_path(orig_inode, o_start, &path); 1320 + if (ret) 631 1321 goto out; 632 - } 633 - last_extent = mext_next_extent(orig_inode, orig_path, 634 - &ext_dummy); 635 - if (last_extent < 0) { 636 - ret = last_extent; 637 - goto out; 638 - } 639 - seq_start = le32_to_cpu(ext_cur->ee_block); 640 - } else if (le32_to_cpu(ext_cur->ee_block) > block_start) 641 - /* The hole exists at the beginning of original file. */ 642 - seq_start = le32_to_cpu(ext_cur->ee_block); 643 - else 644 - seq_start = block_start; 645 - 646 - /* No blocks within the specified range. */ 647 - if (le32_to_cpu(ext_cur->ee_block) > block_end) { 648 - ext4_debug("ext4 move extent: The specified range of file " 649 - "may be the hole\n"); 650 - ret = -EINVAL; 651 - goto out; 652 - } 653 - 654 - /* Adjust start blocks */ 655 - add_blocks = min(le32_to_cpu(ext_cur->ee_block) + 656 - ext4_ext_get_actual_len(ext_cur), block_end + 1) - 657 - max(le32_to_cpu(ext_cur->ee_block), block_start); 658 - 659 - while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) { 660 - seq_blocks += add_blocks; 661 - 662 - /* Adjust tail blocks */ 663 - if (seq_start + seq_blocks - 1 > block_end) 664 - seq_blocks = block_end - seq_start + 1; 665 - 666 - ext_prev = ext_cur; 667 - last_extent = mext_next_extent(orig_inode, holecheck_path, 668 - &ext_cur); 669 - if (last_extent < 0) { 670 - ret = last_extent; 671 - break; 672 - } 673 - add_blocks = ext4_ext_get_actual_len(ext_cur); 674 - 675 - /* 676 - * Extend the length of contiguous block (seq_blocks) 677 - * if extents are contiguous. 678 - */ 679 - if (ext4_can_extents_be_merged(orig_inode, 680 - ext_prev, ext_cur) && 681 - block_end >= le32_to_cpu(ext_cur->ee_block) && 682 - !last_extent) 1322 + ex = path[path->p_depth].p_ext; 1323 + next_blk = ext4_ext_next_allocated_block(path); 1324 + cur_blk = le32_to_cpu(ex->ee_block); 1325 + cur_len = ext4_ext_get_actual_len(ex); 1326 + /* Check hole before the start pos */ 1327 + if (cur_blk + cur_len - 1 < o_start) { 1328 + if (next_blk == EXT_MAX_BLOCKS) { 1329 + o_start = o_end; 1330 + ret = -ENODATA; 1331 + goto out; 1332 + } 1333 + d_start += next_blk - o_start; 1334 + o_start = next_blk; 683 1335 continue; 684 - 685 - /* Is original extent is unwritten */ 686 - unwritten = ext4_ext_is_unwritten(ext_prev); 687 - 688 - data_offset_in_page = seq_start % blocks_per_page; 689 - 690 - /* 691 - * Calculate data blocks count that should be swapped 692 - * at the first page. 693 - */ 694 - if (data_offset_in_page + seq_blocks > blocks_per_page) { 695 - /* Swapped blocks are across pages */ 696 - block_len_in_page = 697 - blocks_per_page - data_offset_in_page; 698 - } else { 699 - /* Swapped blocks are in a page */ 700 - block_len_in_page = seq_blocks; 1336 + /* Check hole after the start pos */ 1337 + } else if (cur_blk > o_start) { 1338 + /* Skip hole */ 1339 + d_start += cur_blk - o_start; 1340 + o_start = cur_blk; 1341 + /* Extent inside requested range ?*/ 1342 + if (cur_blk >= o_end) 1343 + goto out; 1344 + } else { /* in_range(o_start, o_blk, o_len) */ 1345 + cur_len += cur_blk - o_start; 701 1346 } 1347 + unwritten = ext4_ext_is_unwritten(ex); 1348 + if (o_end - o_start < cur_len) 1349 + cur_len = o_end - o_start; 702 1350 703 - orig_page_offset = seq_start >> 704 - (PAGE_CACHE_SHIFT - orig_inode->i_blkbits); 705 - seq_end_page = (seq_start + seq_blocks - 1) >> 706 - (PAGE_CACHE_SHIFT - orig_inode->i_blkbits); 707 - seq_start = le32_to_cpu(ext_cur->ee_block); 708 - rest_blocks = seq_blocks; 709 - 1351 + orig_page_index = o_start >> (PAGE_CACHE_SHIFT - 1352 + orig_inode->i_blkbits); 1353 + donor_page_index = d_start >> (PAGE_CACHE_SHIFT - 1354 + donor_inode->i_blkbits); 1355 + offset_in_page = o_start % blocks_per_page; 1356 + if (cur_len > blocks_per_page- offset_in_page) 1357 + cur_len = blocks_per_page - offset_in_page; 710 1358 /* 711 1359 * Up semaphore to avoid following problems: 712 1360 * a. transaction deadlock among ext4_journal_start, ··· 652 1426 * in move_extent_per_page 653 1427 */ 654 1428 ext4_double_up_write_data_sem(orig_inode, donor_inode); 655 - 656 - while (orig_page_offset <= seq_end_page) { 657 - 658 - /* Swap original branches with new branches */ 659 - block_len_in_page = move_extent_per_page( 660 - o_filp, donor_inode, 661 - orig_page_offset, 662 - data_offset_in_page, 663 - block_len_in_page, 664 - unwritten, &ret); 665 - 666 - /* Count how many blocks we have exchanged */ 667 - *moved_len += block_len_in_page; 668 - if (ret < 0) 669 - break; 670 - if (*moved_len > len) { 671 - EXT4_ERROR_INODE(orig_inode, 672 - "We replaced blocks too much! " 673 - "sum of replaced: %llu requested: %llu", 674 - *moved_len, len); 675 - ret = -EIO; 676 - break; 677 - } 678 - 679 - orig_page_offset++; 680 - data_offset_in_page = 0; 681 - rest_blocks -= block_len_in_page; 682 - if (rest_blocks > blocks_per_page) 683 - block_len_in_page = blocks_per_page; 684 - else 685 - block_len_in_page = rest_blocks; 686 - } 687 - 1429 + /* Swap original branches with new branches */ 1430 + move_extent_per_page(o_filp, donor_inode, 1431 + orig_page_index, donor_page_index, 1432 + offset_in_page, cur_len, 1433 + unwritten, &ret); 688 1434 ext4_double_down_write_data_sem(orig_inode, donor_inode); 689 1435 if (ret < 0) 690 1436 break; 691 - 692 - /* Decrease buffer counter */ 693 - if (holecheck_path) 694 - ext4_ext_drop_refs(holecheck_path); 695 - ret = get_ext_path(orig_inode, seq_start, &holecheck_path); 696 - if (ret) 697 - break; 698 - depth = holecheck_path->p_depth; 699 - 700 - /* Decrease buffer counter */ 701 - if (orig_path) 702 - ext4_ext_drop_refs(orig_path); 703 - ret = get_ext_path(orig_inode, seq_start, &orig_path); 704 - if (ret) 705 - break; 706 - 707 - ext_cur = holecheck_path[depth].p_ext; 708 - add_blocks = ext4_ext_get_actual_len(ext_cur); 709 - seq_blocks = 0; 710 - 1437 + o_start += cur_len; 1438 + d_start += cur_len; 711 1439 } 1440 + *moved_len = o_start - orig_blk; 1441 + if (*moved_len > len) 1442 + *moved_len = len; 1443 + 712 1444 out: 713 1445 if (*moved_len) { 714 1446 ext4_discard_preallocations(orig_inode); 715 1447 ext4_discard_preallocations(donor_inode); 716 1448 } 717 1449 718 - if (orig_path) { 719 - ext4_ext_drop_refs(orig_path); 720 - kfree(orig_path); 721 - } 722 - if (holecheck_path) { 723 - ext4_ext_drop_refs(holecheck_path); 724 - kfree(holecheck_path); 725 - } 1450 + ext4_ext_drop_refs(path); 1451 + kfree(path); 726 1452 ext4_double_up_write_data_sem(orig_inode, donor_inode); 727 1453 ext4_inode_resume_unlocked_dio(orig_inode); 728 1454 ext4_inode_resume_unlocked_dio(donor_inode);

+99 -141

fs/ext4/namei.c

··· 53 53 ext4_lblk_t *block) 54 54 { 55 55 struct buffer_head *bh; 56 - int err = 0; 56 + int err; 57 57 58 58 if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && 59 59 ((inode->i_size >> 10) >= ··· 62 62 63 63 *block = inode->i_size >> inode->i_sb->s_blocksize_bits; 64 64 65 - bh = ext4_bread(handle, inode, *block, 1, &err); 66 - if (!bh) 67 - return ERR_PTR(err); 65 + bh = ext4_bread(handle, inode, *block, 1); 66 + if (IS_ERR(bh)) 67 + return bh; 68 68 inode->i_size += inode->i_sb->s_blocksize; 69 69 EXT4_I(inode)->i_disksize = inode->i_size; 70 70 BUFFER_TRACE(bh, "get_write_access"); ··· 94 94 { 95 95 struct buffer_head *bh; 96 96 struct ext4_dir_entry *dirent; 97 - int err = 0, is_dx_block = 0; 97 + int is_dx_block = 0; 98 98 99 - bh = ext4_bread(NULL, inode, block, 0, &err); 100 - if (!bh) { 101 - if (err == 0) { 102 - ext4_error_inode(inode, __func__, line, block, 103 - "Directory hole found"); 104 - return ERR_PTR(-EIO); 105 - } 99 + bh = ext4_bread(NULL, inode, block, 0); 100 + if (IS_ERR(bh)) { 106 101 __ext4_warning(inode->i_sb, __func__, line, 107 - "error reading directory block " 108 - "(ino %lu, block %lu)", inode->i_ino, 102 + "error %ld reading directory block " 103 + "(ino %lu, block %lu)", PTR_ERR(bh), inode->i_ino, 109 104 (unsigned long) block); 110 - return ERR_PTR(err); 105 + 106 + return bh; 107 + } 108 + if (!bh) { 109 + ext4_error_inode(inode, __func__, line, block, "Directory hole found"); 110 + return ERR_PTR(-EIO); 111 111 } 112 112 dirent = (struct ext4_dir_entry *) bh->b_data; 113 113 /* Determine whether or not we have an index block */ ··· 124 124 "directory leaf block found instead of index block"); 125 125 return ERR_PTR(-EIO); 126 126 } 127 - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 128 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) || 127 + if (!ext4_has_metadata_csum(inode->i_sb) || 129 128 buffer_verified(bh)) 130 129 return bh; 131 130 ··· 252 253 static struct dx_frame *dx_probe(const struct qstr *d_name, 253 254 struct inode *dir, 254 255 struct dx_hash_info *hinfo, 255 - struct dx_frame *frame, 256 - int *err); 256 + struct dx_frame *frame); 257 257 static void dx_release(struct dx_frame *frames); 258 258 static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, 259 259 struct dx_hash_info *hinfo, struct dx_map_entry map[]); ··· 268 270 __u32 *start_hash); 269 271 static struct buffer_head * ext4_dx_find_entry(struct inode *dir, 270 272 const struct qstr *d_name, 271 - struct ext4_dir_entry_2 **res_dir, 272 - int *err); 273 + struct ext4_dir_entry_2 **res_dir); 273 274 static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, 274 275 struct inode *inode); 275 276 ··· 337 340 { 338 341 struct ext4_dir_entry_tail *t; 339 342 340 - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 341 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 343 + if (!ext4_has_metadata_csum(inode->i_sb)) 342 344 return 1; 343 345 344 346 t = get_dirent_tail(inode, dirent); ··· 358 362 { 359 363 struct ext4_dir_entry_tail *t; 360 364 361 - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 362 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 365 + if (!ext4_has_metadata_csum(inode->i_sb)) 363 366 return; 364 367 365 368 t = get_dirent_tail(inode, dirent); ··· 433 438 struct dx_tail *t; 434 439 int count_offset, limit, count; 435 440 436 - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 437 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 441 + if (!ext4_has_metadata_csum(inode->i_sb)) 438 442 return 1; 439 443 440 444 c = get_dx_countlimit(inode, dirent, &count_offset); ··· 462 468 struct dx_tail *t; 463 469 int count_offset, limit, count; 464 470 465 - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 466 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 471 + if (!ext4_has_metadata_csum(inode->i_sb)) 467 472 return; 468 473 469 474 c = get_dx_countlimit(inode, dirent, &count_offset); ··· 550 557 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - 551 558 EXT4_DIR_REC_LEN(2) - infosize; 552 559 553 - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, 554 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 560 + if (ext4_has_metadata_csum(dir->i_sb)) 555 561 entry_space -= sizeof(struct dx_tail); 556 562 return entry_space / sizeof(struct dx_entry); 557 563 } ··· 559 567 { 560 568 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); 561 569 562 - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, 563 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 570 + if (ext4_has_metadata_csum(dir->i_sb)) 564 571 entry_space -= sizeof(struct dx_tail); 565 572 return entry_space / sizeof(struct dx_entry); 566 573 } ··· 632 641 u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; 633 642 struct stats stats; 634 643 printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); 635 - if (!(bh = ext4_bread (NULL,dir, block, 0,&err))) continue; 644 + bh = ext4_bread(NULL,dir, block, 0); 645 + if (!bh || IS_ERR(bh)) 646 + continue; 636 647 stats = levels? 637 648 dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): 638 649 dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0); ··· 662 669 */ 663 670 static struct dx_frame * 664 671 dx_probe(const struct qstr *d_name, struct inode *dir, 665 - struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) 672 + struct dx_hash_info *hinfo, struct dx_frame *frame_in) 666 673 { 667 674 unsigned count, indirect; 668 675 struct dx_entry *at, *entries, *p, *q, *m; 669 676 struct dx_root *root; 670 - struct buffer_head *bh; 671 677 struct dx_frame *frame = frame_in; 678 + struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); 672 679 u32 hash; 673 680 674 - frame->bh = NULL; 675 - bh = ext4_read_dirblock(dir, 0, INDEX); 676 - if (IS_ERR(bh)) { 677 - *err = PTR_ERR(bh); 678 - goto fail; 679 - } 680 - root = (struct dx_root *) bh->b_data; 681 + frame->bh = ext4_read_dirblock(dir, 0, INDEX); 682 + if (IS_ERR(frame->bh)) 683 + return (struct dx_frame *) frame->bh; 684 + 685 + root = (struct dx_root *) frame->bh->b_data; 681 686 if (root->info.hash_version != DX_HASH_TEA && 682 687 root->info.hash_version != DX_HASH_HALF_MD4 && 683 688 root->info.hash_version != DX_HASH_LEGACY) { 684 689 ext4_warning(dir->i_sb, "Unrecognised inode hash code %d", 685 690 root->info.hash_version); 686 - brelse(bh); 687 - *err = ERR_BAD_DX_DIR; 688 691 goto fail; 689 692 } 690 693 hinfo->hash_version = root->info.hash_version; ··· 694 705 if (root->info.unused_flags & 1) { 695 706 ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x", 696 707 root->info.unused_flags); 697 - brelse(bh); 698 - *err = ERR_BAD_DX_DIR; 699 708 goto fail; 700 709 } 701 710 702 711 if ((indirect = root->info.indirect_levels) > 1) { 703 712 ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", 704 713 root->info.indirect_levels); 705 - brelse(bh); 706 - *err = ERR_BAD_DX_DIR; 707 714 goto fail; 708 715 } 709 716 ··· 709 724 if (dx_get_limit(entries) != dx_root_limit(dir, 710 725 root->info.info_length)) { 711 726 ext4_warning(dir->i_sb, "dx entry: limit != root limit"); 712 - brelse(bh); 713 - *err = ERR_BAD_DX_DIR; 714 727 goto fail; 715 728 } 716 729 717 730 dxtrace(printk("Look up %x", hash)); 718 - while (1) 719 - { 731 + while (1) { 720 732 count = dx_get_count(entries); 721 733 if (!count || count > dx_get_limit(entries)) { 722 734 ext4_warning(dir->i_sb, 723 735 "dx entry: no count or count > limit"); 724 - brelse(bh); 725 - *err = ERR_BAD_DX_DIR; 726 - goto fail2; 736 + goto fail; 727 737 } 728 738 729 739 p = entries + 1; 730 740 q = entries + count - 1; 731 - while (p <= q) 732 - { 741 + while (p <= q) { 733 742 m = p + (q - p)/2; 734 743 dxtrace(printk(".")); 735 744 if (dx_get_hash(m) > hash) ··· 732 753 p = m + 1; 733 754 } 734 755 735 - if (0) // linear search cross check 736 - { 756 + if (0) { // linear search cross check 737 757 unsigned n = count - 1; 738 758 at = entries; 739 759 while (n--) ··· 749 771 750 772 at = p - 1; 751 773 dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); 752 - frame->bh = bh; 753 774 frame->entries = entries; 754 775 frame->at = at; 755 - if (!indirect--) return frame; 756 - bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX); 757 - if (IS_ERR(bh)) { 758 - *err = PTR_ERR(bh); 759 - goto fail2; 776 + if (!indirect--) 777 + return frame; 778 + frame++; 779 + frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX); 780 + if (IS_ERR(frame->bh)) { 781 + ret_err = (struct dx_frame *) frame->bh; 782 + frame->bh = NULL; 783 + goto fail; 760 784 } 761 - entries = ((struct dx_node *) bh->b_data)->entries; 785 + entries = ((struct dx_node *) frame->bh->b_data)->entries; 762 786 763 787 if (dx_get_limit(entries) != dx_node_limit (dir)) { 764 788 ext4_warning(dir->i_sb, 765 789 "dx entry: limit != node limit"); 766 - brelse(bh); 767 - *err = ERR_BAD_DX_DIR; 768 - goto fail2; 790 + goto fail; 769 791 } 770 - frame++; 771 - frame->bh = NULL; 772 792 } 773 - fail2: 793 + fail: 774 794 while (frame >= frame_in) { 775 795 brelse(frame->bh); 776 796 frame--; 777 797 } 778 - fail: 779 - if (*err == ERR_BAD_DX_DIR) 798 + if (ret_err == ERR_PTR(ERR_BAD_DX_DIR)) 780 799 ext4_warning(dir->i_sb, 781 800 "Corrupt dir inode %lu, running e2fsck is " 782 801 "recommended.", dir->i_ino); 783 - return NULL; 802 + return ret_err; 784 803 } 785 804 786 805 static void dx_release (struct dx_frame *frames) ··· 963 988 } 964 989 hinfo.hash = start_hash; 965 990 hinfo.minor_hash = 0; 966 - frame = dx_probe(NULL, dir, &hinfo, frames, &err); 967 - if (!frame) 968 - return err; 991 + frame = dx_probe(NULL, dir, &hinfo, frames); 992 + if (IS_ERR(frame)) 993 + return PTR_ERR(frame); 969 994 970 995 /* Add '.' and '..' from the htree header */ 971 996 if (!start_hash && !start_minor_hash) { ··· 1202 1227 buffer */ 1203 1228 int num = 0; 1204 1229 ext4_lblk_t nblocks; 1205 - int i, err = 0; 1206 - int namelen; 1230 + int i, namelen; 1207 1231 1208 1232 *res_dir = NULL; 1209 1233 sb = dir->i_sb; ··· 1232 1258 goto restart; 1233 1259 } 1234 1260 if (is_dx(dir)) { 1235 - bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); 1261 + bh = ext4_dx_find_entry(dir, d_name, res_dir); 1236 1262 /* 1237 1263 * On success, or if the error was file not found, 1238 1264 * return. Otherwise, fall back to doing a search the 1239 1265 * old fashioned way. 1240 1266 */ 1241 - if (err == -ENOENT) 1242 - return NULL; 1243 - if (err && err != ERR_BAD_DX_DIR) 1244 - return ERR_PTR(err); 1245 - if (bh) 1267 + if (!IS_ERR(bh) || PTR_ERR(bh) != ERR_BAD_DX_DIR) 1246 1268 return bh; 1247 1269 dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " 1248 1270 "falling back\n")); ··· 1268 1298 break; 1269 1299 } 1270 1300 num++; 1271 - bh = ext4_getblk(NULL, dir, b++, 0, &err); 1272 - if (unlikely(err)) { 1301 + bh = ext4_getblk(NULL, dir, b++, 0); 1302 + if (unlikely(IS_ERR(bh))) { 1273 1303 if (ra_max == 0) 1274 - return ERR_PTR(err); 1304 + return bh; 1275 1305 break; 1276 1306 } 1277 1307 bh_use[ra_max] = bh; ··· 1336 1366 } 1337 1367 1338 1368 static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, 1339 - struct ext4_dir_entry_2 **res_dir, int *err) 1369 + struct ext4_dir_entry_2 **res_dir) 1340 1370 { 1341 1371 struct super_block * sb = dir->i_sb; 1342 1372 struct dx_hash_info hinfo; ··· 1345 1375 ext4_lblk_t block; 1346 1376 int retval; 1347 1377 1348 - if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) 1349 - return NULL; 1378 + frame = dx_probe(d_name, dir, &hinfo, frames); 1379 + if (IS_ERR(frame)) 1380 + return (struct buffer_head *) frame; 1350 1381 do { 1351 1382 block = dx_get_block(frame->at); 1352 1383 bh = ext4_read_dirblock(dir, block, DIRENT); 1353 - if (IS_ERR(bh)) { 1354 - *err = PTR_ERR(bh); 1384 + if (IS_ERR(bh)) 1355 1385 goto errout; 1356 - } 1386 + 1357 1387 retval = search_dirblock(bh, dir, d_name, 1358 1388 block << EXT4_BLOCK_SIZE_BITS(sb), 1359 1389 res_dir); 1360 - if (retval == 1) { /* Success! */ 1361 - dx_release(frames); 1362 - return bh; 1363 - } 1390 + if (retval == 1) 1391 + goto success; 1364 1392 brelse(bh); 1365 1393 if (retval == -1) { 1366 - *err = ERR_BAD_DX_DIR; 1394 + bh = ERR_PTR(ERR_BAD_DX_DIR); 1367 1395 goto errout; 1368 1396 } 1369 1397 ··· 1370 1402 frames, NULL); 1371 1403 if (retval < 0) { 1372 1404 ext4_warning(sb, 1373 - "error reading index page in directory #%lu", 1374 - dir->i_ino); 1375 - *err = retval; 1405 + "error %d reading index page in directory #%lu", 1406 + retval, dir->i_ino); 1407 + bh = ERR_PTR(retval); 1376 1408 goto errout; 1377 1409 } 1378 1410 } while (retval == 1); 1379 1411 1380 - *err = -ENOENT; 1412 + bh = NULL; 1381 1413 errout: 1382 1414 dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name)); 1383 - dx_release (frames); 1384 - return NULL; 1415 + success: 1416 + dx_release(frames); 1417 + return bh; 1385 1418 } 1386 1419 1387 1420 static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) ··· 1410 1441 dentry); 1411 1442 return ERR_PTR(-EIO); 1412 1443 } 1413 - inode = ext4_iget(dir->i_sb, ino); 1444 + inode = ext4_iget_normal(dir->i_sb, ino); 1414 1445 if (inode == ERR_PTR(-ESTALE)) { 1415 1446 EXT4_ERROR_INODE(dir, 1416 1447 "deleted inode referenced: %u", ··· 1443 1474 return ERR_PTR(-EIO); 1444 1475 } 1445 1476 1446 - return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); 1477 + return d_obtain_alias(ext4_iget_normal(child->d_inode->i_sb, ino)); 1447 1478 } 1448 1479 1449 1480 /* ··· 1502 1533 */ 1503 1534 static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, 1504 1535 struct buffer_head **bh,struct dx_frame *frame, 1505 - struct dx_hash_info *hinfo, int *error) 1536 + struct dx_hash_info *hinfo) 1506 1537 { 1507 1538 unsigned blocksize = dir->i_sb->s_blocksize; 1508 1539 unsigned count, continued; ··· 1517 1548 int csum_size = 0; 1518 1549 int err = 0, i; 1519 1550 1520 - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, 1521 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 1551 + if (ext4_has_metadata_csum(dir->i_sb)) 1522 1552 csum_size = sizeof(struct ext4_dir_entry_tail); 1523 1553 1524 1554 bh2 = ext4_append(handle, dir, &newblock); 1525 1555 if (IS_ERR(bh2)) { 1526 1556 brelse(*bh); 1527 1557 *bh = NULL; 1528 - *error = PTR_ERR(bh2); 1529 - return NULL; 1558 + return (struct ext4_dir_entry_2 *) bh2; 1530 1559 } 1531 1560 1532 1561 BUFFER_TRACE(*bh, "get_write_access"); ··· 1584 1617 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); 1585 1618 1586 1619 /* Which block gets the new entry? */ 1587 - if (hinfo->hash >= hash2) 1588 - { 1620 + if (hinfo->hash >= hash2) { 1589 1621 swap(*bh, bh2); 1590 1622 de = de2; 1591 1623 } ··· 1604 1638 brelse(bh2); 1605 1639 *bh = NULL; 1606 1640 ext4_std_error(dir->i_sb, err); 1607 - *error = err; 1608 - return NULL; 1641 + return ERR_PTR(err); 1609 1642 } 1610 1643 1611 1644 int ext4_find_dest_de(struct inode *dir, struct inode *inode, ··· 1683 1718 int csum_size = 0; 1684 1719 int err; 1685 1720 1686 - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 1687 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 1721 + if (ext4_has_metadata_csum(inode->i_sb)) 1688 1722 csum_size = sizeof(struct ext4_dir_entry_tail); 1689 1723 1690 1724 if (!de) { ··· 1750 1786 struct fake_dirent *fde; 1751 1787 int csum_size = 0; 1752 1788 1753 - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 1754 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 1789 + if (ext4_has_metadata_csum(inode->i_sb)) 1755 1790 csum_size = sizeof(struct ext4_dir_entry_tail); 1756 1791 1757 1792 blocksize = dir->i_sb->s_blocksize; ··· 1825 1862 ext4_handle_dirty_dx_node(handle, dir, frame->bh); 1826 1863 ext4_handle_dirty_dirent_node(handle, dir, bh); 1827 1864 1828 - de = do_split(handle,dir, &bh, frame, &hinfo, &retval); 1829 - if (!de) { 1865 + de = do_split(handle,dir, &bh, frame, &hinfo); 1866 + if (IS_ERR(de)) { 1830 1867 /* 1831 1868 * Even if the block split failed, we have to properly write 1832 1869 * out all the changes we did so far. Otherwise we can end up ··· 1834 1871 */ 1835 1872 ext4_mark_inode_dirty(handle, dir); 1836 1873 dx_release(frames); 1837 - return retval; 1874 + return PTR_ERR(de); 1838 1875 } 1839 1876 dx_release(frames); 1840 1877 ··· 1867 1904 ext4_lblk_t block, blocks; 1868 1905 int csum_size = 0; 1869 1906 1870 - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 1871 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 1907 + if (ext4_has_metadata_csum(inode->i_sb)) 1872 1908 csum_size = sizeof(struct ext4_dir_entry_tail); 1873 1909 1874 1910 sb = dir->i_sb; ··· 1944 1982 struct ext4_dir_entry_2 *de; 1945 1983 int err; 1946 1984 1947 - frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); 1948 - if (!frame) 1949 - return err; 1985 + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames); 1986 + if (IS_ERR(frame)) 1987 + return PTR_ERR(frame); 1950 1988 entries = frame->entries; 1951 1989 at = frame->at; 1952 1990 bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT); ··· 2057 2095 goto cleanup; 2058 2096 } 2059 2097 } 2060 - de = do_split(handle, dir, &bh, frame, &hinfo, &err); 2061 - if (!de) 2098 + de = do_split(handle, dir, &bh, frame, &hinfo); 2099 + if (IS_ERR(de)) { 2100 + err = PTR_ERR(de); 2062 2101 goto cleanup; 2102 + } 2063 2103 err = add_dirent_to_buf(handle, dentry, inode, de, bh); 2064 2104 goto cleanup; 2065 2105 ··· 2131 2167 return err; 2132 2168 } 2133 2169 2134 - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, 2135 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 2170 + if (ext4_has_metadata_csum(dir->i_sb)) 2136 2171 csum_size = sizeof(struct ext4_dir_entry_tail); 2137 2172 2138 2173 BUFFER_TRACE(bh, "get_write_access"); ··· 2350 2387 int csum_size = 0; 2351 2388 int err; 2352 2389 2353 - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, 2354 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 2390 + if (ext4_has_metadata_csum(dir->i_sb)) 2355 2391 csum_size = sizeof(struct ext4_dir_entry_tail); 2356 2392 2357 2393 if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ··· 2365 2403 dir_block = ext4_append(handle, inode, &block); 2366 2404 if (IS_ERR(dir_block)) 2367 2405 return PTR_ERR(dir_block); 2368 - BUFFER_TRACE(dir_block, "get_write_access"); 2369 - err = ext4_journal_get_write_access(handle, dir_block); 2370 - if (err) 2371 - goto out; 2372 2406 de = (struct ext4_dir_entry_2 *)dir_block->b_data; 2373 2407 ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); 2374 2408 set_nlink(inode, 2); ··· 2531 2573 int err = 0, rc; 2532 2574 bool dirty = false; 2533 2575 2534 - if (!sbi->s_journal) 2576 + if (!sbi->s_journal || is_bad_inode(inode)) 2535 2577 return 0; 2536 2578 2537 2579 WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&

+1 -2

fs/ext4/resize.c

··· 1212 1212 { 1213 1213 struct buffer_head *bh; 1214 1214 1215 - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 1216 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 1215 + if (!ext4_has_metadata_csum(sb)) 1217 1216 return 0; 1218 1217 1219 1218 bh = ext4_get_bitmap(sb, group_data->inode_bitmap);

+128 -117

fs/ext4/super.c

··· 70 70 static void ext4_clear_journal_err(struct super_block *sb, 71 71 struct ext4_super_block *es); 72 72 static int ext4_sync_fs(struct super_block *sb, int wait); 73 - static int ext4_sync_fs_nojournal(struct super_block *sb, int wait); 74 73 static int ext4_remount(struct super_block *sb, int *flags, char *data); 75 74 static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); 76 75 static int ext4_unfreeze(struct super_block *sb); ··· 140 141 static int ext4_superblock_csum_verify(struct super_block *sb, 141 142 struct ext4_super_block *es) 142 143 { 143 - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 144 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 144 + if (!ext4_has_metadata_csum(sb)) 145 145 return 1; 146 146 147 147 return es->s_checksum == ext4_superblock_csum(sb, es); ··· 150 152 { 151 153 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 152 154 153 - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 154 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 155 + if (!ext4_has_metadata_csum(sb)) 155 156 return; 156 157 157 158 es->s_checksum = ext4_superblock_csum(sb, es); ··· 817 820 percpu_counter_destroy(&sbi->s_freeinodes_counter); 818 821 percpu_counter_destroy(&sbi->s_dirs_counter); 819 822 percpu_counter_destroy(&sbi->s_dirtyclusters_counter); 820 - percpu_counter_destroy(&sbi->s_extent_cache_cnt); 821 823 brelse(sbi->s_sbh); 822 824 #ifdef CONFIG_QUOTA 823 - for (i = 0; i < MAXQUOTAS; i++) 825 + for (i = 0; i < EXT4_MAXQUOTAS; i++) 824 826 kfree(sbi->s_qf_names[i]); 825 827 #endif 826 828 ··· 881 885 ext4_es_init_tree(&ei->i_es_tree); 882 886 rwlock_init(&ei->i_es_lock); 883 887 INIT_LIST_HEAD(&ei->i_es_lru); 888 + ei->i_es_all_nr = 0; 884 889 ei->i_es_lru_nr = 0; 885 890 ei->i_touch_when = 0; 886 891 ei->i_reserved_data_blocks = 0; ··· 999 1002 * Currently we don't know the generation for parent directory, so 1000 1003 * a generation of 0 means "accept any" 1001 1004 */ 1002 - inode = ext4_iget(sb, ino); 1005 + inode = ext4_iget_normal(sb, ino); 1003 1006 if (IS_ERR(inode)) 1004 1007 return ERR_CAST(inode); 1005 1008 if (generation && inode->i_generation != generation) { ··· 1111 1114 .sync_fs = ext4_sync_fs, 1112 1115 .freeze_fs = ext4_freeze, 1113 1116 .unfreeze_fs = ext4_unfreeze, 1114 - .statfs = ext4_statfs, 1115 - .remount_fs = ext4_remount, 1116 - .show_options = ext4_show_options, 1117 - #ifdef CONFIG_QUOTA 1118 - .quota_read = ext4_quota_read, 1119 - .quota_write = ext4_quota_write, 1120 - #endif 1121 - .bdev_try_to_free_page = bdev_try_to_free_page, 1122 - }; 1123 - 1124 - static const struct super_operations ext4_nojournal_sops = { 1125 - .alloc_inode = ext4_alloc_inode, 1126 - .destroy_inode = ext4_destroy_inode, 1127 - .write_inode = ext4_write_inode, 1128 - .dirty_inode = ext4_dirty_inode, 1129 - .drop_inode = ext4_drop_inode, 1130 - .evict_inode = ext4_evict_inode, 1131 - .sync_fs = ext4_sync_fs_nojournal, 1132 - .put_super = ext4_put_super, 1133 1117 .statfs = ext4_statfs, 1134 1118 .remount_fs = ext4_remount, 1135 1119 .show_options = ext4_show_options, ··· 1690 1712 "not specified"); 1691 1713 return 0; 1692 1714 } 1693 - } else { 1694 - if (sbi->s_jquota_fmt) { 1695 - ext4_msg(sb, KERN_ERR, "journaled quota format " 1696 - "specified with no journaling " 1697 - "enabled"); 1698 - return 0; 1699 - } 1700 1715 } 1701 1716 #endif 1702 1717 if (test_opt(sb, DIOREAD_NOLOCK)) { ··· 1987 2016 __u16 crc = 0; 1988 2017 __le32 le_group = cpu_to_le32(block_group); 1989 2018 1990 - if ((sbi->s_es->s_feature_ro_compat & 1991 - cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) { 2019 + if (ext4_has_metadata_csum(sbi->s_sb)) { 1992 2020 /* Use new metadata_csum algorithm */ 1993 2021 __le16 save_csum; 1994 2022 __u32 csum32; ··· 2005 2035 } 2006 2036 2007 2037 /* old crc16 code */ 2038 + if (!(sbi->s_es->s_feature_ro_compat & 2039 + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM))) 2040 + return 0; 2041 + 2008 2042 offset = offsetof(struct ext4_group_desc, bg_checksum); 2009 2043 2010 2044 crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); ··· 2165 2191 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { 2166 2192 /* don't clear list on RO mount w/ errors */ 2167 2193 if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { 2168 - jbd_debug(1, "Errors on filesystem, " 2194 + ext4_msg(sb, KERN_INFO, "Errors on filesystem, " 2169 2195 "clearing orphan list.\n"); 2170 2196 es->s_last_orphan = 0; 2171 2197 } ··· 2181 2207 /* Needed for iput() to work correctly and not trash data */ 2182 2208 sb->s_flags |= MS_ACTIVE; 2183 2209 /* Turn on quotas so that they are updated correctly */ 2184 - for (i = 0; i < MAXQUOTAS; i++) { 2210 + for (i = 0; i < EXT4_MAXQUOTAS; i++) { 2185 2211 if (EXT4_SB(sb)->s_qf_names[i]) { 2186 2212 int ret = ext4_quota_on_mount(sb, i); 2187 2213 if (ret < 0) ··· 2237 2263 PLURAL(nr_truncates)); 2238 2264 #ifdef CONFIG_QUOTA 2239 2265 /* Turn quotas off */ 2240 - for (i = 0; i < MAXQUOTAS; i++) { 2266 + for (i = 0; i < EXT4_MAXQUOTAS; i++) { 2241 2267 if (sb_dqopt(sb)->files[i]) 2242 2268 dquot_quota_off(sb, i); 2243 2269 } ··· 2522 2548 return count; 2523 2549 } 2524 2550 2551 + static ssize_t es_ui_show(struct ext4_attr *a, 2552 + struct ext4_sb_info *sbi, char *buf) 2553 + { 2554 + 2555 + unsigned int *ui = (unsigned int *) (((char *) sbi->s_es) + 2556 + a->u.offset); 2557 + 2558 + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); 2559 + } 2560 + 2525 2561 static ssize_t reserved_clusters_show(struct ext4_attr *a, 2526 2562 struct ext4_sb_info *sbi, char *buf) 2527 2563 { ··· 2585 2601 .offset = offsetof(struct ext4_sb_info, _elname),\ 2586 2602 }, \ 2587 2603 } 2604 + 2605 + #define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname) \ 2606 + static struct ext4_attr ext4_attr_##_name = { \ 2607 + .attr = {.name = __stringify(_name), .mode = _mode }, \ 2608 + .show = _show, \ 2609 + .store = _store, \ 2610 + .u = { \ 2611 + .offset = offsetof(struct ext4_super_block, _elname), \ 2612 + }, \ 2613 + } 2614 + 2588 2615 #define EXT4_ATTR(name, mode, show, store) \ 2589 2616 static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) 2590 2617 2591 2618 #define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL) 2592 2619 #define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) 2593 2620 #define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) 2621 + 2622 + #define EXT4_RO_ATTR_ES_UI(name, elname) \ 2623 + EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname) 2594 2624 #define EXT4_RW_ATTR_SBI_UI(name, elname) \ 2595 2625 EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) 2626 + 2596 2627 #define ATTR_LIST(name) &ext4_attr_##name.attr 2597 2628 #define EXT4_DEPRECATED_ATTR(_name, _val) \ 2598 2629 static struct ext4_attr ext4_attr_##_name = { \ ··· 2640 2641 EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); 2641 2642 EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); 2642 2643 EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); 2644 + EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); 2645 + EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time); 2646 + EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time); 2643 2647 2644 2648 static struct attribute *ext4_attrs[] = { 2645 2649 ATTR_LIST(delayed_allocation_blocks), ··· 2666 2664 ATTR_LIST(warning_ratelimit_burst), 2667 2665 ATTR_LIST(msg_ratelimit_interval_ms), 2668 2666 ATTR_LIST(msg_ratelimit_burst), 2667 + ATTR_LIST(errors_count), 2668 + ATTR_LIST(first_error_time), 2669 + ATTR_LIST(last_error_time), 2669 2670 NULL, 2670 2671 }; 2671 2672 ··· 2728 2723 complete(&ext4_feat->f_kobj_unregister); 2729 2724 } 2730 2725 2726 + static ssize_t ext4_feat_show(struct kobject *kobj, 2727 + struct attribute *attr, char *buf) 2728 + { 2729 + return snprintf(buf, PAGE_SIZE, "supported\n"); 2730 + } 2731 + 2732 + /* 2733 + * We can not use ext4_attr_show/store because it relies on the kobject 2734 + * being embedded in the ext4_sb_info structure which is definitely not 2735 + * true in this case. 2736 + */ 2737 + static const struct sysfs_ops ext4_feat_ops = { 2738 + .show = ext4_feat_show, 2739 + .store = NULL, 2740 + }; 2741 + 2731 2742 static struct kobj_type ext4_feat_ktype = { 2732 2743 .default_attrs = ext4_feat_attrs, 2733 - .sysfs_ops = &ext4_attr_ops, 2744 + .sysfs_ops = &ext4_feat_ops, 2734 2745 .release = ext4_feat_release, 2735 2746 }; 2736 2747 ··· 3200 3179 int compat, incompat; 3201 3180 struct ext4_sb_info *sbi = EXT4_SB(sb); 3202 3181 3203 - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 3204 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { 3182 + if (ext4_has_metadata_csum(sb)) { 3205 3183 /* journal checksum v3 */ 3206 3184 compat = 0; 3207 3185 incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; ··· 3210 3190 incompat = 0; 3211 3191 } 3212 3192 3193 + jbd2_journal_clear_features(sbi->s_journal, 3194 + JBD2_FEATURE_COMPAT_CHECKSUM, 0, 3195 + JBD2_FEATURE_INCOMPAT_CSUM_V3 | 3196 + JBD2_FEATURE_INCOMPAT_CSUM_V2); 3213 3197 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { 3214 3198 ret = jbd2_journal_set_features(sbi->s_journal, 3215 3199 compat, 0, ··· 3226 3202 jbd2_journal_clear_features(sbi->s_journal, 0, 0, 3227 3203 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); 3228 3204 } else { 3229 - jbd2_journal_clear_features(sbi->s_journal, 3230 - JBD2_FEATURE_COMPAT_CHECKSUM, 0, 3231 - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | 3232 - JBD2_FEATURE_INCOMPAT_CSUM_V3 | 3233 - JBD2_FEATURE_INCOMPAT_CSUM_V2); 3205 + jbd2_journal_clear_features(sbi->s_journal, 0, 0, 3206 + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); 3234 3207 } 3235 3208 3236 3209 return ret; ··· 3457 3436 logical_sb_block = sb_block; 3458 3437 } 3459 3438 3460 - if (!(bh = sb_bread(sb, logical_sb_block))) { 3439 + if (!(bh = sb_bread_unmovable(sb, logical_sb_block))) { 3461 3440 ext4_msg(sb, KERN_ERR, "unable to read superblock"); 3462 3441 goto out_fail; 3463 3442 } ··· 3508 3487 } 3509 3488 3510 3489 /* Precompute checksum seed for all metadata */ 3511 - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 3512 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 3490 + if (ext4_has_metadata_csum(sb)) 3513 3491 sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, 3514 3492 sizeof(es->s_uuid)); 3515 3493 ··· 3539 3519 set_opt(sb, ERRORS_CONT); 3540 3520 else 3541 3521 set_opt(sb, ERRORS_RO); 3542 - if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) 3543 - set_opt(sb, BLOCK_VALIDITY); 3522 + /* block_validity enabled by default; disable with noblock_validity */ 3523 + set_opt(sb, BLOCK_VALIDITY); 3544 3524 if (def_mount_opts & EXT4_DEFM_DISCARD) 3545 3525 set_opt(sb, DISCARD); 3546 3526 ··· 3666 3646 brelse(bh); 3667 3647 logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; 3668 3648 offset = do_div(logical_sb_block, blocksize); 3669 - bh = sb_bread(sb, logical_sb_block); 3649 + bh = sb_bread_unmovable(sb, logical_sb_block); 3670 3650 if (!bh) { 3671 3651 ext4_msg(sb, KERN_ERR, 3672 3652 "Can't read superblock on 2nd try"); ··· 3888 3868 3889 3869 for (i = 0; i < db_count; i++) { 3890 3870 block = descriptor_loc(sb, logical_sb_block, i); 3891 - sbi->s_group_desc[i] = sb_bread(sb, block); 3871 + sbi->s_group_desc[i] = sb_bread_unmovable(sb, block); 3892 3872 if (!sbi->s_group_desc[i]) { 3893 3873 ext4_msg(sb, KERN_ERR, 3894 3874 "can't read group descriptor %d", i); ··· 3910 3890 sbi->s_err_report.data = (unsigned long) sb; 3911 3891 3912 3892 /* Register extent status tree shrinker */ 3913 - ext4_es_register_shrinker(sbi); 3914 - 3915 - err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL); 3916 - if (err) { 3917 - ext4_msg(sb, KERN_ERR, "insufficient memory"); 3893 + if (ext4_es_register_shrinker(sbi)) 3918 3894 goto failed_mount3; 3919 - } 3920 3895 3921 3896 sbi->s_stripe = ext4_get_stripe_size(sbi); 3922 3897 sbi->s_extent_max_zeroout_kb = 32; ··· 3919 3904 /* 3920 3905 * set up enough so that it can read an inode 3921 3906 */ 3922 - if (!test_opt(sb, NOLOAD) && 3923 - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) 3924 - sb->s_op = &ext4_sops; 3925 - else 3926 - sb->s_op = &ext4_nojournal_sops; 3907 + sb->s_op = &ext4_sops; 3927 3908 sb->s_export_op = &ext4_export_ops; 3928 3909 sb->s_xattr = ext4_xattr_handlers; 3929 3910 #ifdef CONFIG_QUOTA ··· 4240 4229 jbd2_journal_destroy(sbi->s_journal); 4241 4230 sbi->s_journal = NULL; 4242 4231 } 4243 - failed_mount3: 4244 4232 ext4_es_unregister_shrinker(sbi); 4233 + failed_mount3: 4245 4234 del_timer_sync(&sbi->s_err_report); 4246 - percpu_counter_destroy(&sbi->s_extent_cache_cnt); 4247 4235 if (sbi->s_mmp_tsk) 4248 4236 kthread_stop(sbi->s_mmp_tsk); 4249 4237 failed_mount2: ··· 4257 4247 remove_proc_entry(sb->s_id, ext4_proc_root); 4258 4248 } 4259 4249 #ifdef CONFIG_QUOTA 4260 - for (i = 0; i < MAXQUOTAS; i++) 4250 + for (i = 0; i < EXT4_MAXQUOTAS; i++) 4261 4251 kfree(sbi->s_qf_names[i]); 4262 4252 #endif 4263 4253 ext4_blkdev_remove(sbi); ··· 4381 4371 EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { 4382 4372 ext4_msg(sb, KERN_ERR, "external journal has " 4383 4373 "bad superblock"); 4374 + brelse(bh); 4375 + goto out_bdev; 4376 + } 4377 + 4378 + if ((le32_to_cpu(es->s_feature_ro_compat) & 4379 + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && 4380 + es->s_checksum != ext4_superblock_csum(sb, es)) { 4381 + ext4_msg(sb, KERN_ERR, "external journal has " 4382 + "corrupt superblock"); 4384 4383 brelse(bh); 4385 4384 goto out_bdev; 4386 4385 } ··· 4696 4677 * being sent at the end of the function. But we can skip it if 4697 4678 * transaction_commit will do it for us. 4698 4679 */ 4699 - target = jbd2_get_latest_transaction(sbi->s_journal); 4700 - if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && 4701 - !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) 4702 - needs_barrier = true; 4680 + if (sbi->s_journal) { 4681 + target = jbd2_get_latest_transaction(sbi->s_journal); 4682 + if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && 4683 + !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) 4684 + needs_barrier = true; 4703 4685 4704 - if (jbd2_journal_start_commit(sbi->s_journal, &target)) { 4705 - if (wait) 4706 - ret = jbd2_log_wait_commit(sbi->s_journal, target); 4707 - } 4686 + if (jbd2_journal_start_commit(sbi->s_journal, &target)) { 4687 + if (wait) 4688 + ret = jbd2_log_wait_commit(sbi->s_journal, 4689 + target); 4690 + } 4691 + } else if (wait && test_opt(sb, BARRIER)) 4692 + needs_barrier = true; 4708 4693 if (needs_barrier) { 4709 4694 int err; 4710 4695 err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); 4711 4696 if (!ret) 4712 4697 ret = err; 4713 4698 } 4714 - 4715 - return ret; 4716 - } 4717 - 4718 - static int ext4_sync_fs_nojournal(struct super_block *sb, int wait) 4719 - { 4720 - int ret = 0; 4721 - 4722 - trace_ext4_sync_fs(sb, wait); 4723 - flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq); 4724 - dquot_writeback_dquots(sb, -1); 4725 - if (wait && test_opt(sb, BARRIER)) 4726 - ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); 4727 4699 4728 4700 return ret; 4729 4701 } ··· 4737 4727 4738 4728 journal = EXT4_SB(sb)->s_journal; 4739 4729 4740 - /* Now we set up the journal barrier. */ 4741 - jbd2_journal_lock_updates(journal); 4730 + if (journal) { 4731 + /* Now we set up the journal barrier. */ 4732 + jbd2_journal_lock_updates(journal); 4742 4733 4743 - /* 4744 - * Don't clear the needs_recovery flag if we failed to flush 4745 - * the journal. 4746 - */ 4747 - error = jbd2_journal_flush(journal); 4748 - if (error < 0) 4749 - goto out; 4734 + /* 4735 + * Don't clear the needs_recovery flag if we failed to 4736 + * flush the journal. 4737 + */ 4738 + error = jbd2_journal_flush(journal); 4739 + if (error < 0) 4740 + goto out; 4741 + } 4750 4742 4751 4743 /* Journal blocked and flushed, clear needs_recovery flag. */ 4752 4744 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 4753 4745 error = ext4_commit_super(sb, 1); 4754 4746 out: 4755 - /* we rely on upper layer to stop further updates */ 4756 - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 4747 + if (journal) 4748 + /* we rely on upper layer to stop further updates */ 4749 + jbd2_journal_unlock_updates(journal); 4757 4750 return error; 4758 4751 } 4759 4752 ··· 4787 4774 u32 s_min_batch_time, s_max_batch_time; 4788 4775 #ifdef CONFIG_QUOTA 4789 4776 int s_jquota_fmt; 4790 - char *s_qf_names[MAXQUOTAS]; 4777 + char *s_qf_names[EXT4_MAXQUOTAS]; 4791 4778 #endif 4792 4779 }; 4793 4780 ··· 4817 4804 old_opts.s_max_batch_time = sbi->s_max_batch_time; 4818 4805 #ifdef CONFIG_QUOTA 4819 4806 old_opts.s_jquota_fmt = sbi->s_jquota_fmt; 4820 - for (i = 0; i < MAXQUOTAS; i++) 4807 + for (i = 0; i < EXT4_MAXQUOTAS; i++) 4821 4808 if (sbi->s_qf_names[i]) { 4822 4809 old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], 4823 4810 GFP_KERNEL); ··· 4978 4965 4979 4966 #ifdef CONFIG_QUOTA 4980 4967 /* Release old quota file names */ 4981 - for (i = 0; i < MAXQUOTAS; i++) 4968 + for (i = 0; i < EXT4_MAXQUOTAS; i++) 4982 4969 kfree(old_opts.s_qf_names[i]); 4983 4970 if (enable_quota) { 4984 4971 if (sb_any_quota_suspended(sb)) ··· 5007 4994 sbi->s_max_batch_time = old_opts.s_max_batch_time; 5008 4995 #ifdef CONFIG_QUOTA 5009 4996 sbi->s_jquota_fmt = old_opts.s_jquota_fmt; 5010 - for (i = 0; i < MAXQUOTAS; i++) { 4997 + for (i = 0; i < EXT4_MAXQUOTAS; i++) { 5011 4998 kfree(sbi->s_qf_names[i]); 5012 4999 sbi->s_qf_names[i] = old_opts.s_qf_names[i]; 5013 5000 } ··· 5210 5197 { 5211 5198 int err; 5212 5199 struct inode *qf_inode; 5213 - unsigned long qf_inums[MAXQUOTAS] = { 5200 + unsigned long qf_inums[EXT4_MAXQUOTAS] = { 5214 5201 le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), 5215 5202 le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) 5216 5203 }; ··· 5238 5225 static int ext4_enable_quotas(struct super_block *sb) 5239 5226 { 5240 5227 int type, err = 0; 5241 - unsigned long qf_inums[MAXQUOTAS] = { 5228 + unsigned long qf_inums[EXT4_MAXQUOTAS] = { 5242 5229 le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), 5243 5230 le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) 5244 5231 }; 5245 5232 5246 5233 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; 5247 - for (type = 0; type < MAXQUOTAS; type++) { 5234 + for (type = 0; type < EXT4_MAXQUOTAS; type++) { 5248 5235 if (qf_inums[type]) { 5249 5236 err = ext4_quota_enable(sb, type, QFMT_VFS_V1, 5250 5237 DQUOT_USAGE_ENABLED); ··· 5322 5309 { 5323 5310 struct inode *inode = sb_dqopt(sb)->files[type]; 5324 5311 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); 5325 - int err = 0; 5326 5312 int offset = off & (sb->s_blocksize - 1); 5327 5313 int tocopy; 5328 5314 size_t toread; ··· 5336 5324 while (toread > 0) { 5337 5325 tocopy = sb->s_blocksize - offset < toread ? 5338 5326 sb->s_blocksize - offset : toread; 5339 - bh = ext4_bread(NULL, inode, blk, 0, &err); 5340 - if (err) 5341 - return err; 5327 + bh = ext4_bread(NULL, inode, blk, 0); 5328 + if (IS_ERR(bh)) 5329 + return PTR_ERR(bh); 5342 5330 if (!bh) /* A hole? */ 5343 5331 memset(data, 0, tocopy); 5344 5332 else ··· 5359 5347 { 5360 5348 struct inode *inode = sb_dqopt(sb)->files[type]; 5361 5349 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); 5362 - int err = 0; 5363 - int offset = off & (sb->s_blocksize - 1); 5350 + int err, offset = off & (sb->s_blocksize - 1); 5364 5351 struct buffer_head *bh; 5365 5352 handle_t *handle = journal_current_handle(); 5366 5353 ··· 5380 5369 return -EIO; 5381 5370 } 5382 5371 5383 - bh = ext4_bread(handle, inode, blk, 1, &err); 5372 + bh = ext4_bread(handle, inode, blk, 1); 5373 + if (IS_ERR(bh)) 5374 + return PTR_ERR(bh); 5384 5375 if (!bh) 5385 5376 goto out; 5386 5377 BUFFER_TRACE(bh, "get write access"); 5387 5378 err = ext4_journal_get_write_access(handle, bh); 5388 5379 if (err) { 5389 5380 brelse(bh); 5390 - goto out; 5381 + return err; 5391 5382 } 5392 5383 lock_buffer(bh); 5393 5384 memcpy(bh->b_data+offset, data, len); ··· 5398 5385 err = ext4_handle_dirty_metadata(handle, NULL, bh); 5399 5386 brelse(bh); 5400 5387 out: 5401 - if (err) 5402 - return err; 5403 5388 if (inode->i_size < off + len) { 5404 5389 i_size_write(inode, off + len); 5405 5390 EXT4_I(inode)->i_disksize = inode->i_size;

+26 -18

fs/ext4/xattr.c

··· 142 142 sector_t block_nr, 143 143 struct ext4_xattr_header *hdr) 144 144 { 145 - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 146 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && 145 + if (ext4_has_metadata_csum(inode->i_sb) && 147 146 (hdr->h_checksum != ext4_xattr_block_csum(inode, block_nr, hdr))) 148 147 return 0; 149 148 return 1; ··· 152 153 sector_t block_nr, 153 154 struct ext4_xattr_header *hdr) 154 155 { 155 - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 156 - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 156 + if (!ext4_has_metadata_csum(inode->i_sb)) 157 157 return; 158 158 159 159 hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr, hdr); ··· 188 190 } 189 191 190 192 static int 191 - ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end) 193 + ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, 194 + void *value_start) 192 195 { 193 - while (!IS_LAST_ENTRY(entry)) { 194 - struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(entry); 196 + struct ext4_xattr_entry *e = entry; 197 + 198 + while (!IS_LAST_ENTRY(e)) { 199 + struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); 195 200 if ((void *)next >= end) 196 201 return -EIO; 197 - entry = next; 202 + e = next; 198 203 } 204 + 205 + while (!IS_LAST_ENTRY(entry)) { 206 + if (entry->e_value_size != 0 && 207 + (value_start + le16_to_cpu(entry->e_value_offs) < 208 + (void *)e + sizeof(__u32) || 209 + value_start + le16_to_cpu(entry->e_value_offs) + 210 + le32_to_cpu(entry->e_value_size) > end)) 211 + return -EIO; 212 + entry = EXT4_XATTR_NEXT(entry); 213 + } 214 + 199 215 return 0; 200 216 } 201 217 ··· 226 214 return -EIO; 227 215 if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh))) 228 216 return -EIO; 229 - error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); 217 + error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size, 218 + bh->b_data); 230 219 if (!error) 231 220 set_buffer_verified(bh); 232 221 return error; ··· 344 331 header = IHDR(inode, raw_inode); 345 332 entry = IFIRST(header); 346 333 end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; 347 - error = ext4_xattr_check_names(entry, end); 334 + error = ext4_xattr_check_names(entry, end, entry); 348 335 if (error) 349 336 goto cleanup; 350 337 error = ext4_xattr_find_entry(&entry, name_index, name, ··· 476 463 raw_inode = ext4_raw_inode(&iloc); 477 464 header = IHDR(inode, raw_inode); 478 465 end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; 479 - error = ext4_xattr_check_names(IFIRST(header), end); 466 + error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header)); 480 467 if (error) 481 468 goto cleanup; 482 469 error = ext4_xattr_list_entries(dentry, IFIRST(header), ··· 912 899 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 913 900 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; 914 901 915 - /* 916 - * take i_data_sem because we will test 917 - * i_delalloc_reserved_flag in ext4_mb_new_blocks 918 - */ 919 - down_read(&EXT4_I(inode)->i_data_sem); 920 902 block = ext4_new_meta_blocks(handle, inode, goal, 0, 921 903 NULL, &error); 922 - up_read((&EXT4_I(inode)->i_data_sem)); 923 904 if (error) 924 905 goto cleanup; 925 906 ··· 993 986 is->s.here = is->s.first; 994 987 is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; 995 988 if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { 996 - error = ext4_xattr_check_names(IFIRST(header), is->s.end); 989 + error = ext4_xattr_check_names(IFIRST(header), is->s.end, 990 + IFIRST(header)); 997 991 if (error) 998 992 return error; 999 993 /* Find the named attribute. */

+1 -1

fs/jbd/journal.c

··· 886 886 goto out_err; 887 887 } 888 888 889 - bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 889 + bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize); 890 890 if (!bh) { 891 891 printk(KERN_ERR 892 892 "%s: Cannot get buffer for journal superblock\n",

+138 -200

fs/jbd2/checkpoint.c

··· 96 96 97 97 if (jh->b_transaction == NULL && !buffer_locked(bh) && 98 98 !buffer_dirty(bh) && !buffer_write_io_error(bh)) { 99 - /* 100 - * Get our reference so that bh cannot be freed before 101 - * we unlock it 102 - */ 103 - get_bh(bh); 104 99 JBUFFER_TRACE(jh, "remove from checkpoint list"); 105 100 ret = __jbd2_journal_remove_checkpoint(jh) + 1; 106 - BUFFER_TRACE(bh, "release"); 107 - __brelse(bh); 108 101 } 109 102 return ret; 110 103 } ··· 115 122 116 123 nblocks = jbd2_space_needed(journal); 117 124 while (jbd2_log_space_left(journal) < nblocks) { 118 - if (journal->j_flags & JBD2_ABORT) 119 - return; 120 125 write_unlock(&journal->j_state_lock); 121 126 mutex_lock(&journal->j_checkpoint_mutex); 122 127 ··· 130 139 * trace for forensic evidence. 131 140 */ 132 141 write_lock(&journal->j_state_lock); 142 + if (journal->j_flags & JBD2_ABORT) { 143 + mutex_unlock(&journal->j_checkpoint_mutex); 144 + return; 145 + } 133 146 spin_lock(&journal->j_list_lock); 134 147 nblocks = jbd2_space_needed(journal); 135 148 space_left = jbd2_log_space_left(journal); ··· 178 183 } 179 184 } 180 185 181 - /* 182 - * Clean up transaction's list of buffers submitted for io. 183 - * We wait for any pending IO to complete and remove any clean 184 - * buffers. Note that we take the buffers in the opposite ordering 185 - * from the one in which they were submitted for IO. 186 - * 187 - * Return 0 on success, and return <0 if some buffers have failed 188 - * to be written out. 189 - * 190 - * Called with j_list_lock held. 191 - */ 192 - static int __wait_cp_io(journal_t *journal, transaction_t *transaction) 193 - { 194 - struct journal_head *jh; 195 - struct buffer_head *bh; 196 - tid_t this_tid; 197 - int released = 0; 198 - int ret = 0; 199 - 200 - this_tid = transaction->t_tid; 201 - restart: 202 - /* Did somebody clean up the transaction in the meanwhile? */ 203 - if (journal->j_checkpoint_transactions != transaction || 204 - transaction->t_tid != this_tid) 205 - return ret; 206 - while (!released && transaction->t_checkpoint_io_list) { 207 - jh = transaction->t_checkpoint_io_list; 208 - bh = jh2bh(jh); 209 - get_bh(bh); 210 - if (buffer_locked(bh)) { 211 - spin_unlock(&journal->j_list_lock); 212 - wait_on_buffer(bh); 213 - /* the journal_head may have gone by now */ 214 - BUFFER_TRACE(bh, "brelse"); 215 - __brelse(bh); 216 - spin_lock(&journal->j_list_lock); 217 - goto restart; 218 - } 219 - if (unlikely(buffer_write_io_error(bh))) 220 - ret = -EIO; 221 - 222 - /* 223 - * Now in whatever state the buffer currently is, we know that 224 - * it has been written out and so we can drop it from the list 225 - */ 226 - released = __jbd2_journal_remove_checkpoint(jh); 227 - __brelse(bh); 228 - } 229 - 230 - return ret; 231 - } 232 - 233 186 static void 234 187 __flush_batch(journal_t *journal, int *batch_count) 235 188 { ··· 198 255 } 199 256 200 257 /* 201 - * Try to flush one buffer from the checkpoint list to disk. 202 - * 203 - * Return 1 if something happened which requires us to abort the current 204 - * scan of the checkpoint list. Return <0 if the buffer has failed to 205 - * be written out. 206 - * 207 - * Called with j_list_lock held and drops it if 1 is returned 208 - */ 209 - static int __process_buffer(journal_t *journal, struct journal_head *jh, 210 - int *batch_count, transaction_t *transaction) 211 - { 212 - struct buffer_head *bh = jh2bh(jh); 213 - int ret = 0; 214 - 215 - if (buffer_locked(bh)) { 216 - get_bh(bh); 217 - spin_unlock(&journal->j_list_lock); 218 - wait_on_buffer(bh); 219 - /* the journal_head may have gone by now */ 220 - BUFFER_TRACE(bh, "brelse"); 221 - __brelse(bh); 222 - ret = 1; 223 - } else if (jh->b_transaction != NULL) { 224 - transaction_t *t = jh->b_transaction; 225 - tid_t tid = t->t_tid; 226 - 227 - transaction->t_chp_stats.cs_forced_to_close++; 228 - spin_unlock(&journal->j_list_lock); 229 - if (unlikely(journal->j_flags & JBD2_UNMOUNT)) 230 - /* 231 - * The journal thread is dead; so starting and 232 - * waiting for a commit to finish will cause 233 - * us to wait for a _very_ long time. 234 - */ 235 - printk(KERN_ERR "JBD2: %s: " 236 - "Waiting for Godot: block %llu\n", 237 - journal->j_devname, 238 - (unsigned long long) bh->b_blocknr); 239 - jbd2_log_start_commit(journal, tid); 240 - jbd2_log_wait_commit(journal, tid); 241 - ret = 1; 242 - } else if (!buffer_dirty(bh)) { 243 - ret = 1; 244 - if (unlikely(buffer_write_io_error(bh))) 245 - ret = -EIO; 246 - get_bh(bh); 247 - BUFFER_TRACE(bh, "remove from checkpoint"); 248 - __jbd2_journal_remove_checkpoint(jh); 249 - spin_unlock(&journal->j_list_lock); 250 - __brelse(bh); 251 - } else { 252 - /* 253 - * Important: we are about to write the buffer, and 254 - * possibly block, while still holding the journal lock. 255 - * We cannot afford to let the transaction logic start 256 - * messing around with this buffer before we write it to 257 - * disk, as that would break recoverability. 258 - */ 259 - BUFFER_TRACE(bh, "queue"); 260 - get_bh(bh); 261 - J_ASSERT_BH(bh, !buffer_jwrite(bh)); 262 - journal->j_chkpt_bhs[*batch_count] = bh; 263 - __buffer_relink_io(jh); 264 - transaction->t_chp_stats.cs_written++; 265 - (*batch_count)++; 266 - if (*batch_count == JBD2_NR_BATCH) { 267 - spin_unlock(&journal->j_list_lock); 268 - __flush_batch(journal, batch_count); 269 - ret = 1; 270 - } 271 - } 272 - return ret; 273 - } 274 - 275 - /* 276 258 * Perform an actual checkpoint. We take the first transaction on the 277 259 * list of transactions to be checkpointed and send all its buffers 278 260 * to disk. We submit larger chunks of data at once. ··· 207 339 */ 208 340 int jbd2_log_do_checkpoint(journal_t *journal) 209 341 { 210 - transaction_t *transaction; 211 - tid_t this_tid; 212 - int result; 342 + struct journal_head *jh; 343 + struct buffer_head *bh; 344 + transaction_t *transaction; 345 + tid_t this_tid; 346 + int result, batch_count = 0; 213 347 214 348 jbd_debug(1, "Start checkpoint\n"); 215 349 ··· 244 374 * done (maybe it's a new transaction, but it fell at the same 245 375 * address). 246 376 */ 247 - if (journal->j_checkpoint_transactions == transaction && 248 - transaction->t_tid == this_tid) { 249 - int batch_count = 0; 250 - struct journal_head *jh; 251 - int retry = 0, err; 377 + if (journal->j_checkpoint_transactions != transaction || 378 + transaction->t_tid != this_tid) 379 + goto out; 252 380 253 - while (!retry && transaction->t_checkpoint_list) { 254 - jh = transaction->t_checkpoint_list; 255 - retry = __process_buffer(journal, jh, &batch_count, 256 - transaction); 257 - if (retry < 0 && !result) 258 - result = retry; 259 - if (!retry && (need_resched() || 260 - spin_needbreak(&journal->j_list_lock))) { 261 - spin_unlock(&journal->j_list_lock); 262 - retry = 1; 263 - break; 264 - } 381 + /* checkpoint all of the transaction's buffers */ 382 + while (transaction->t_checkpoint_list) { 383 + jh = transaction->t_checkpoint_list; 384 + bh = jh2bh(jh); 385 + 386 + if (buffer_locked(bh)) { 387 + spin_unlock(&journal->j_list_lock); 388 + get_bh(bh); 389 + wait_on_buffer(bh); 390 + /* the journal_head may have gone by now */ 391 + BUFFER_TRACE(bh, "brelse"); 392 + __brelse(bh); 393 + goto retry; 265 394 } 395 + if (jh->b_transaction != NULL) { 396 + transaction_t *t = jh->b_transaction; 397 + tid_t tid = t->t_tid; 266 398 267 - if (batch_count) { 268 - if (!retry) { 269 - spin_unlock(&journal->j_list_lock); 270 - retry = 1; 271 - } 272 - __flush_batch(journal, &batch_count); 399 + transaction->t_chp_stats.cs_forced_to_close++; 400 + spin_unlock(&journal->j_list_lock); 401 + if (unlikely(journal->j_flags & JBD2_UNMOUNT)) 402 + /* 403 + * The journal thread is dead; so 404 + * starting and waiting for a commit 405 + * to finish will cause us to wait for 406 + * a _very_ long time. 407 + */ 408 + printk(KERN_ERR 409 + "JBD2: %s: Waiting for Godot: block %llu\n", 410 + journal->j_devname, (unsigned long long) bh->b_blocknr); 411 + 412 + jbd2_log_start_commit(journal, tid); 413 + jbd2_log_wait_commit(journal, tid); 414 + goto retry; 273 415 } 274 - 275 - if (retry) { 276 - spin_lock(&journal->j_list_lock); 277 - goto restart; 416 + if (!buffer_dirty(bh)) { 417 + if (unlikely(buffer_write_io_error(bh)) && !result) 418 + result = -EIO; 419 + BUFFER_TRACE(bh, "remove from checkpoint"); 420 + if (__jbd2_journal_remove_checkpoint(jh)) 421 + /* The transaction was released; we're done */ 422 + goto out; 423 + continue; 278 424 } 279 425 /* 280 - * Now we have cleaned up the first transaction's checkpoint 281 - * list. Let's clean up the second one 426 + * Important: we are about to write the buffer, and 427 + * possibly block, while still holding the journal 428 + * lock. We cannot afford to let the transaction 429 + * logic start messing around with this buffer before 430 + * we write it to disk, as that would break 431 + * recoverability. 282 432 */ 283 - err = __wait_cp_io(journal, transaction); 284 - if (!result) 285 - result = err; 433 + BUFFER_TRACE(bh, "queue"); 434 + get_bh(bh); 435 + J_ASSERT_BH(bh, !buffer_jwrite(bh)); 436 + journal->j_chkpt_bhs[batch_count++] = bh; 437 + __buffer_relink_io(jh); 438 + transaction->t_chp_stats.cs_written++; 439 + if ((batch_count == JBD2_NR_BATCH) || 440 + need_resched() || 441 + spin_needbreak(&journal->j_list_lock)) 442 + goto unlock_and_flush; 443 + } 444 + 445 + if (batch_count) { 446 + unlock_and_flush: 447 + spin_unlock(&journal->j_list_lock); 448 + retry: 449 + if (batch_count) 450 + __flush_batch(journal, &batch_count); 451 + spin_lock(&journal->j_list_lock); 452 + goto restart; 453 + } 454 + 455 + /* 456 + * Now we issued all of the transaction's buffers, let's deal 457 + * with the buffers that are out for I/O. 458 + */ 459 + restart2: 460 + /* Did somebody clean up the transaction in the meanwhile? */ 461 + if (journal->j_checkpoint_transactions != transaction || 462 + transaction->t_tid != this_tid) 463 + goto out; 464 + 465 + while (transaction->t_checkpoint_io_list) { 466 + jh = transaction->t_checkpoint_io_list; 467 + bh = jh2bh(jh); 468 + if (buffer_locked(bh)) { 469 + spin_unlock(&journal->j_list_lock); 470 + get_bh(bh); 471 + wait_on_buffer(bh); 472 + /* the journal_head may have gone by now */ 473 + BUFFER_TRACE(bh, "brelse"); 474 + __brelse(bh); 475 + spin_lock(&journal->j_list_lock); 476 + goto restart2; 477 + } 478 + if (unlikely(buffer_write_io_error(bh)) && !result) 479 + result = -EIO; 480 + 481 + /* 482 + * Now in whatever state the buffer currently is, we 483 + * know that it has been written out and so we can 484 + * drop it from the list 485 + */ 486 + if (__jbd2_journal_remove_checkpoint(jh)) 487 + break; 286 488 } 287 489 out: 288 490 spin_unlock(&journal->j_list_lock); ··· 420 478 * Find all the written-back checkpoint buffers in the given list and 421 479 * release them. 422 480 * 423 - * Called with the journal locked. 424 481 * Called with j_list_lock held. 425 - * Returns number of buffers reaped (for debug) 482 + * Returns 1 if we freed the transaction, 0 otherwise. 426 483 */ 427 - 428 - static int journal_clean_one_cp_list(struct journal_head *jh, int *released) 484 + static int journal_clean_one_cp_list(struct journal_head *jh) 429 485 { 430 486 struct journal_head *last_jh; 431 487 struct journal_head *next_jh = jh; 432 - int ret, freed = 0; 488 + int ret; 489 + int freed = 0; 433 490 434 - *released = 0; 435 491 if (!jh) 436 492 return 0; 437 493 ··· 438 498 jh = next_jh; 439 499 next_jh = jh->b_cpnext; 440 500 ret = __try_to_free_cp_buf(jh); 441 - if (ret) { 442 - freed++; 443 - if (ret == 2) { 444 - *released = 1; 445 - return freed; 446 - } 447 - } 501 + if (!ret) 502 + return freed; 503 + if (ret == 2) 504 + return 1; 505 + freed = 1; 448 506 /* 449 507 * This function only frees up some memory 450 508 * if possible so we dont have an obligation ··· 461 523 * 462 524 * Find all the written-back checkpoint buffers in the journal and release them. 463 525 * 464 - * Called with the journal locked. 465 526 * Called with j_list_lock held. 466 - * Returns number of buffers reaped (for debug) 467 527 */ 468 - 469 - int __jbd2_journal_clean_checkpoint_list(journal_t *journal) 528 + void __jbd2_journal_clean_checkpoint_list(journal_t *journal) 470 529 { 471 530 transaction_t *transaction, *last_transaction, *next_transaction; 472 - int ret = 0; 473 - int released; 531 + int ret; 474 532 475 533 transaction = journal->j_checkpoint_transactions; 476 534 if (!transaction) 477 - goto out; 535 + return; 478 536 479 537 last_transaction = transaction->t_cpprev; 480 538 next_transaction = transaction; 481 539 do { 482 540 transaction = next_transaction; 483 541 next_transaction = transaction->t_cpnext; 484 - ret += journal_clean_one_cp_list(transaction-> 485 - t_checkpoint_list, &released); 542 + ret = journal_clean_one_cp_list(transaction->t_checkpoint_list); 486 543 /* 487 544 * This function only frees up some memory if possible so we 488 545 * dont have an obligation to finish processing. Bail out if 489 546 * preemption requested: 490 547 */ 491 548 if (need_resched()) 492 - goto out; 493 - if (released) 549 + return; 550 + if (ret) 494 551 continue; 495 552 /* 496 553 * It is essential that we are as careful as in the case of 497 554 * t_checkpoint_list with removing the buffer from the list as 498 555 * we can possibly see not yet submitted buffers on io_list 499 556 */ 500 - ret += journal_clean_one_cp_list(transaction-> 501 - t_checkpoint_io_list, &released); 557 + ret = journal_clean_one_cp_list(transaction-> 558 + t_checkpoint_io_list); 502 559 if (need_resched()) 503 - goto out; 560 + return; 561 + /* 562 + * Stop scanning if we couldn't free the transaction. This 563 + * avoids pointless scanning of transactions which still 564 + * weren't checkpointed. 565 + */ 566 + if (!ret) 567 + return; 504 568 } while (transaction != last_transaction); 505 - out: 506 - return ret; 507 569 } 508 570 509 571 /*

+9 -9

fs/jbd2/journal.c

··· 1237 1237 goto out_err; 1238 1238 } 1239 1239 1240 - bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 1240 + bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize); 1241 1241 if (!bh) { 1242 1242 printk(KERN_ERR 1243 1243 "%s: Cannot get buffer for journal superblock\n", ··· 1522 1522 goto out; 1523 1523 } 1524 1524 1525 - if (jbd2_journal_has_csum_v2or3(journal) && 1526 - JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { 1527 - /* Can't have checksum v1 and v2 on at the same time! */ 1528 - printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 " 1529 - "at the same time!\n"); 1530 - goto out; 1531 - } 1532 - 1533 1525 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) && 1534 1526 JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) { 1535 1527 /* Can't have checksum v2 and v3 at the same time! */ 1536 1528 printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 " 1529 + "at the same time!\n"); 1530 + goto out; 1531 + } 1532 + 1533 + if (jbd2_journal_has_csum_v2or3(journal) && 1534 + JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { 1535 + /* Can't have checksum v1 and v2 on at the same time! */ 1536 + printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " 1537 1537 "at the same time!\n"); 1538 1538 goto out; 1539 1539 }

+1

fs/jbd2/recovery.c

··· 525 525 !jbd2_descr_block_csum_verify(journal, 526 526 bh->b_data)) { 527 527 err = -EIO; 528 + brelse(bh); 528 529 goto failed; 529 530 } 530 531

+42 -5

include/linux/buffer_head.h

··· 175 175 wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); 176 176 struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, 177 177 unsigned size); 178 - struct buffer_head *__getblk(struct block_device *bdev, sector_t block, 179 - unsigned size); 178 + struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block, 179 + unsigned size, gfp_t gfp); 180 180 void __brelse(struct buffer_head *); 181 181 void __bforget(struct buffer_head *); 182 182 void __breadahead(struct block_device *, sector_t block, unsigned int size); 183 - struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size); 183 + struct buffer_head *__bread_gfp(struct block_device *, 184 + sector_t block, unsigned size, gfp_t gfp); 184 185 void invalidate_bh_lrus(void); 185 186 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); 186 187 void free_buffer_head(struct buffer_head * bh); ··· 296 295 static inline struct buffer_head * 297 296 sb_bread(struct super_block *sb, sector_t block) 298 297 { 299 - return __bread(sb->s_bdev, block, sb->s_blocksize); 298 + return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE); 299 + } 300 + 301 + static inline struct buffer_head * 302 + sb_bread_unmovable(struct super_block *sb, sector_t block) 303 + { 304 + return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, 0); 300 305 } 301 306 302 307 static inline void ··· 314 307 static inline struct buffer_head * 315 308 sb_getblk(struct super_block *sb, sector_t block) 316 309 { 317 - return __getblk(sb->s_bdev, block, sb->s_blocksize); 310 + return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE); 318 311 } 319 312 320 313 static inline struct buffer_head * ··· 349 342 might_sleep(); 350 343 if (!trylock_buffer(bh)) 351 344 __lock_buffer(bh); 345 + } 346 + 347 + static inline struct buffer_head *getblk_unmovable(struct block_device *bdev, 348 + sector_t block, 349 + unsigned size) 350 + { 351 + return __getblk_gfp(bdev, block, size, 0); 352 + } 353 + 354 + static inline struct buffer_head *__getblk(struct block_device *bdev, 355 + sector_t block, 356 + unsigned size) 357 + { 358 + return __getblk_gfp(bdev, block, size, __GFP_MOVABLE); 359 + } 360 + 361 + /** 362 + * __bread() - reads a specified block and returns the bh 363 + * @bdev: the block_device to read from 364 + * @block: number of block 365 + * @size: size (in bytes) to read 366 + * 367 + * Reads a specified block, and returns buffer head that contains it. 368 + * The page cache is allocated from movable area so that it can be migrated. 369 + * It returns NULL if the block was unreadable. 370 + */ 371 + static inline struct buffer_head * 372 + __bread(struct block_device *bdev, sector_t block, unsigned size) 373 + { 374 + return __bread_gfp(bdev, block, size, __GFP_MOVABLE); 352 375 } 353 376 354 377 extern int __set_page_dirty_buffers(struct page *page);

+1 -1

include/linux/jbd2.h

··· 1042 1042 extern void jbd2_journal_commit_transaction(journal_t *); 1043 1043 1044 1044 /* Checkpoint list management */ 1045 - int __jbd2_journal_clean_checkpoint_list(journal_t *journal); 1045 + void __jbd2_journal_clean_checkpoint_list(journal_t *journal); 1046 1046 int __jbd2_journal_remove_checkpoint(struct journal_head *); 1047 1047 void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *); 1048 1048

+1

include/linux/mm.h

··· 1176 1176 1177 1177 extern void truncate_pagecache(struct inode *inode, loff_t new); 1178 1178 extern void truncate_setsize(struct inode *inode, loff_t newsize); 1179 + void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); 1179 1180 void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); 1180 1181 int truncate_inode_page(struct address_space *mapping, struct page *page); 1181 1182 int generic_error_remove_page(struct address_space *mapping, struct page *page);

+51 -8

include/trace/events/ext4.h

··· 2369 2369 show_extent_status(__entry->found ? __entry->status : 0)) 2370 2370 ); 2371 2371 2372 - TRACE_EVENT(ext4_es_shrink_enter, 2372 + DECLARE_EVENT_CLASS(ext4__es_shrink_enter, 2373 2373 TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), 2374 2374 2375 2375 TP_ARGS(sb, nr_to_scan, cache_cnt), ··· 2391 2391 __entry->nr_to_scan, __entry->cache_cnt) 2392 2392 ); 2393 2393 2394 - TRACE_EVENT(ext4_es_shrink_exit, 2395 - TP_PROTO(struct super_block *sb, int shrunk_nr, int cache_cnt), 2394 + DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_count, 2395 + TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), 2396 2396 2397 - TP_ARGS(sb, shrunk_nr, cache_cnt), 2397 + TP_ARGS(sb, nr_to_scan, cache_cnt) 2398 + ); 2399 + 2400 + DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_scan_enter, 2401 + TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), 2402 + 2403 + TP_ARGS(sb, nr_to_scan, cache_cnt) 2404 + ); 2405 + 2406 + TRACE_EVENT(ext4_es_shrink_scan_exit, 2407 + TP_PROTO(struct super_block *sb, int nr_shrunk, int cache_cnt), 2408 + 2409 + TP_ARGS(sb, nr_shrunk, cache_cnt), 2398 2410 2399 2411 TP_STRUCT__entry( 2400 2412 __field( dev_t, dev ) 2401 - __field( int, shrunk_nr ) 2413 + __field( int, nr_shrunk ) 2402 2414 __field( int, cache_cnt ) 2403 2415 ), 2404 2416 2405 2417 TP_fast_assign( 2406 2418 __entry->dev = sb->s_dev; 2407 - __entry->shrunk_nr = shrunk_nr; 2419 + __entry->nr_shrunk = nr_shrunk; 2408 2420 __entry->cache_cnt = cache_cnt; 2409 2421 ), 2410 2422 2411 - TP_printk("dev %d,%d shrunk_nr %d cache_cnt %d", 2423 + TP_printk("dev %d,%d nr_shrunk %d cache_cnt %d", 2412 2424 MAJOR(__entry->dev), MINOR(__entry->dev), 2413 - __entry->shrunk_nr, __entry->cache_cnt) 2425 + __entry->nr_shrunk, __entry->cache_cnt) 2414 2426 ); 2415 2427 2416 2428 TRACE_EVENT(ext4_collapse_range, ··· 2448 2436 MAJOR(__entry->dev), MINOR(__entry->dev), 2449 2437 (unsigned long) __entry->ino, 2450 2438 __entry->offset, __entry->len) 2439 + ); 2440 + 2441 + TRACE_EVENT(ext4_es_shrink, 2442 + TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time, 2443 + int skip_precached, int nr_skipped, int retried), 2444 + 2445 + TP_ARGS(sb, nr_shrunk, scan_time, skip_precached, nr_skipped, retried), 2446 + 2447 + TP_STRUCT__entry( 2448 + __field( dev_t, dev ) 2449 + __field( int, nr_shrunk ) 2450 + __field( unsigned long long, scan_time ) 2451 + __field( int, skip_precached ) 2452 + __field( int, nr_skipped ) 2453 + __field( int, retried ) 2454 + ), 2455 + 2456 + TP_fast_assign( 2457 + __entry->dev = sb->s_dev; 2458 + __entry->nr_shrunk = nr_shrunk; 2459 + __entry->scan_time = div_u64(scan_time, 1000); 2460 + __entry->skip_precached = skip_precached; 2461 + __entry->nr_skipped = nr_skipped; 2462 + __entry->retried = retried; 2463 + ), 2464 + 2465 + TP_printk("dev %d,%d nr_shrunk %d, scan_time %llu skip_precached %d " 2466 + "nr_skipped %d retried %d", 2467 + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk, 2468 + __entry->scan_time, __entry->skip_precached, 2469 + __entry->nr_skipped, __entry->retried) 2451 2470 ); 2452 2471 2453 2472 #endif /* _TRACE_EXT4_H */

+57

mm/truncate.c

··· 20 20 #include <linux/buffer_head.h> /* grr. try_to_release_page, 21 21 do_invalidatepage */ 22 22 #include <linux/cleancache.h> 23 + #include <linux/rmap.h> 23 24 #include "internal.h" 24 25 25 26 static void clear_exceptional_entry(struct address_space *mapping, ··· 720 719 */ 721 720 void truncate_setsize(struct inode *inode, loff_t newsize) 722 721 { 722 + loff_t oldsize = inode->i_size; 723 + 723 724 i_size_write(inode, newsize); 725 + if (newsize > oldsize) 726 + pagecache_isize_extended(inode, oldsize, newsize); 724 727 truncate_pagecache(inode, newsize); 725 728 } 726 729 EXPORT_SYMBOL(truncate_setsize); 730 + 731 + /** 732 + * pagecache_isize_extended - update pagecache after extension of i_size 733 + * @inode: inode for which i_size was extended 734 + * @from: original inode size 735 + * @to: new inode size 736 + * 737 + * Handle extension of inode size either caused by extending truncate or by 738 + * write starting after current i_size. We mark the page straddling current 739 + * i_size RO so that page_mkwrite() is called on the nearest write access to 740 + * the page. This way filesystem can be sure that page_mkwrite() is called on 741 + * the page before user writes to the page via mmap after the i_size has been 742 + * changed. 743 + * 744 + * The function must be called after i_size is updated so that page fault 745 + * coming after we unlock the page will already see the new i_size. 746 + * The function must be called while we still hold i_mutex - this not only 747 + * makes sure i_size is stable but also that userspace cannot observe new 748 + * i_size value before we are prepared to store mmap writes at new inode size. 749 + */ 750 + void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) 751 + { 752 + int bsize = 1 << inode->i_blkbits; 753 + loff_t rounded_from; 754 + struct page *page; 755 + pgoff_t index; 756 + 757 + WARN_ON(!mutex_is_locked(&inode->i_mutex)); 758 + WARN_ON(to > inode->i_size); 759 + 760 + if (from >= to || bsize == PAGE_CACHE_SIZE) 761 + return; 762 + /* Page straddling @from will not have any hole block created? */ 763 + rounded_from = round_up(from, bsize); 764 + if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1))) 765 + return; 766 + 767 + index = from >> PAGE_CACHE_SHIFT; 768 + page = find_lock_page(inode->i_mapping, index); 769 + /* Page not cached? Nothing to do */ 770 + if (!page) 771 + return; 772 + /* 773 + * See clear_page_dirty_for_io() for details why set_page_dirty() 774 + * is needed. 775 + */ 776 + if (page_mkclean(page)) 777 + set_page_dirty(page); 778 + unlock_page(page); 779 + page_cache_release(page); 780 + } 781 + EXPORT_SYMBOL(pagecache_isize_extended); 727 782 728 783 /** 729 784 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched