Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (47 commits)
ext4: Fix potential fiemap deadlock (mmap_sem vs. i_data_sem)
ext4: Do not override ext2 or ext3 if they are built as modules
jbd2: Export jbd2_log_start_commit to fix ext4 build
ext4: Fix insufficient checks in EXT4_IOC_MOVE_EXT
ext4: Wait for proper transaction commit on fsync
ext4: fix incorrect block reservation on quota transfer.
ext4: quota macros cleanup
ext4: ext4_get_reserved_space() must return bytes instead of blocks
ext4: remove blocks from inode prealloc list on failure
ext4: wait for log to commit when umounting
ext4: Avoid data / filesystem corruption when write fails to copy data
ext4: Use ext4 file system driver for ext2/ext3 file system mounts
ext4: Return the PTR_ERR of the correct pointer in setup_new_group_blocks()
jbd2: Add ENOMEM checking in and for jbd2_journal_write_metadata_buffer()
ext4: remove unused parameter wbc from __ext4_journalled_writepage()
ext4: remove encountered_congestion trace
ext4: move_extent_per_page() cleanup
ext4: initialize moved_len before calling ext4_move_extents()
ext4: Fix double-free of blocks with EXT4_IOC_MOVE_EXT
ext4: use ext4_data_block_valid() in ext4_free_blocks()
...

+678 -515
+8 -2
Documentation/filesystems/ext4.txt
··· 153 identified through its new major/minor numbers encoded 154 in devnum. 155 156 - noload Don't load the journal on mounting. Note that 157 - if the filesystem was not unmounted cleanly, 158 skipping the journal replay will lead to the 159 filesystem containing inconsistencies that can 160 lead to any number of problems. ··· 352 "zero-length" problem that can happen when a 353 system crashes before the delayed allocation 354 blocks are forced to disk. 355 356 Data Mode 357 =========
··· 153 identified through its new major/minor numbers encoded 154 in devnum. 155 156 + norecovery Don't load the journal on mounting. Note that 157 + noload if the filesystem was not unmounted cleanly, 158 skipping the journal replay will lead to the 159 filesystem containing inconsistencies that can 160 lead to any number of problems. ··· 352 "zero-length" problem that can happen when a 353 system crashes before the delayed allocation 354 blocks are forced to disk. 355 + 356 + discard Controls whether ext4 should issue discard/TRIM 357 + nodiscard(*) commands to the underlying block device when 358 + blocks are freed. This is useful for SSD devices 359 + and sparse/thinly-provisioned LUNs, but it is off 360 + by default until sufficient testing has been done. 361 362 Data Mode 363 =========
+10
fs/ext4/Kconfig
··· 26 27 If unsure, say N. 28 29 config EXT4_FS_XATTR 30 bool "Ext4 extended attributes" 31 depends on EXT4_FS
··· 26 27 If unsure, say N. 28 29 + config EXT4_USE_FOR_EXT23 30 + bool "Use ext4 for ext2/ext3 file systems" 31 + depends on EXT3_FS=n || EXT2_FS=n 32 + default y 33 + help 34 + Allow the ext4 file system driver code to be used for ext2 or 35 + ext3 file system mounts. This allows users to reduce their 36 + compiled kernel size by using one file system driver for 37 + ext2, ext3, and ext4 file systems. 38 + 39 config EXT4_FS_XATTR 40 bool "Ext4 extended attributes" 41 depends on EXT4_FS
+7 -39
fs/ext4/balloc.c
··· 499 } 500 501 /** 502 - * ext4_free_blocks() -- Free given blocks and update quota 503 - * @handle: handle for this transaction 504 - * @inode: inode 505 - * @block: start physical block to free 506 - * @count: number of blocks to count 507 - * @metadata: Are these metadata blocks 508 - */ 509 - void ext4_free_blocks(handle_t *handle, struct inode *inode, 510 - ext4_fsblk_t block, unsigned long count, 511 - int metadata) 512 - { 513 - struct super_block *sb; 514 - unsigned long dquot_freed_blocks; 515 - 516 - /* this isn't the right place to decide whether block is metadata 517 - * inode.c/extents.c knows better, but for safety ... */ 518 - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 519 - metadata = 1; 520 - 521 - /* We need to make sure we don't reuse 522 - * block released untill the transaction commit. 523 - * writeback mode have weak data consistency so 524 - * don't force data as metadata when freeing block 525 - * for writeback mode. 526 - */ 527 - if (metadata == 0 && !ext4_should_writeback_data(inode)) 528 - metadata = 1; 529 - 530 - sb = inode->i_sb; 531 - 532 - ext4_mb_free_blocks(handle, inode, block, count, 533 - metadata, &dquot_freed_blocks); 534 - if (dquot_freed_blocks) 535 - vfs_dq_free_block(inode, dquot_freed_blocks); 536 - return; 537 - } 538 - 539 - /** 540 * ext4_has_free_blocks() 541 * @sbi: in-core super block structure. 542 * @nblocks: number of needed blocks ··· 723 static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, 724 ext4_group_t group) 725 { 726 - return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0; 727 } 728 729 /**
··· 499 } 500 501 /** 502 * ext4_has_free_blocks() 503 * @sbi: in-core super block structure. 504 * @nblocks: number of needed blocks ··· 761 static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, 762 ext4_group_t group) 763 { 764 + if (!ext4_bg_has_super(sb, group)) 765 + return 0; 766 + 767 + if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG)) 768 + return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); 769 + else 770 + return EXT4_SB(sb)->s_gdb_count; 771 } 772 773 /**
+2 -1
fs/ext4/block_validity.c
··· 160 if (ext4_bg_has_super(sb, i) && 161 ((i < 5) || ((i % flex_size) == 0))) 162 add_system_zone(sbi, ext4_group_first_block_no(sb, i), 163 - sbi->s_gdb_count + 1); 164 gdp = ext4_get_group_desc(sb, i, NULL); 165 ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); 166 if (ret) ··· 228 struct rb_node *n = sbi->system_blks.rb_node; 229 230 if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || 231 (start_blk + count > ext4_blocks_count(sbi->s_es))) 232 return 0; 233 while (n) {
··· 160 if (ext4_bg_has_super(sb, i) && 161 ((i < 5) || ((i % flex_size) == 0))) 162 add_system_zone(sbi, ext4_group_first_block_no(sb, i), 163 + ext4_bg_num_gdb(sb, i) + 1); 164 gdp = ext4_get_group_desc(sb, i, NULL); 165 ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); 166 if (ret) ··· 228 struct rb_node *n = sbi->system_blks.rb_node; 229 230 if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || 231 + (start_blk + count < start_blk) || 232 (start_blk + count > ext4_blocks_count(sbi->s_es))) 233 return 0; 234 while (n) {
+17 -6
fs/ext4/ext4.h
··· 376 EXT4_GET_BLOCKS_DIO_CREATE_EXT) 377 378 /* 379 * ioctl commands 380 */ 381 #define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS ··· 709 struct list_head i_aio_dio_complete_list; 710 /* current io_end structure for async DIO write*/ 711 ext4_io_end_t *cur_aio_dio; 712 }; 713 714 /* ··· 763 #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 764 #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ 765 #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ 766 767 #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt 768 #define set_opt(o, opt) o |= EXT4_MOUNT_##opt ··· 1338 ext4_fsblk_t goal, unsigned long *count, int *errp); 1339 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); 1340 extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); 1341 - extern void ext4_free_blocks(handle_t *handle, struct inode *inode, 1342 - ext4_fsblk_t block, unsigned long count, int metadata); 1343 extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, 1344 ext4_fsblk_t block, unsigned long count); 1345 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); ··· 1396 extern void ext4_discard_preallocations(struct inode *); 1397 extern int __init init_ext4_mballoc(void); 1398 extern void exit_ext4_mballoc(void); 1399 - extern void ext4_mb_free_blocks(handle_t *, struct inode *, 1400 - ext4_fsblk_t, unsigned long, int, unsigned long *); 1401 extern int ext4_mb_add_groupinfo(struct super_block *sb, 1402 ext4_group_t i, struct ext4_group_desc *desc); 1403 extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); 1404 extern void ext4_mb_put_buddy_cache_lock(struct super_block *, 1405 ext4_group_t, int); 1406 /* inode.c */ 1407 - int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 1408 - struct buffer_head *bh, ext4_fsblk_t blocknr); 1409 struct buffer_head *ext4_getblk(handle_t *, struct inode *, 1410 ext4_lblk_t, int, int *); 1411 struct 
buffer_head *ext4_bread(handle_t *, struct inode *,
··· 376 EXT4_GET_BLOCKS_DIO_CREATE_EXT) 377 378 /* 379 + * Flags used by ext4_free_blocks 380 + */ 381 + #define EXT4_FREE_BLOCKS_METADATA 0x0001 382 + #define EXT4_FREE_BLOCKS_FORGET 0x0002 383 + 384 + /* 385 * ioctl commands 386 */ 387 #define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS ··· 703 struct list_head i_aio_dio_complete_list; 704 /* current io_end structure for async DIO write*/ 705 ext4_io_end_t *cur_aio_dio; 706 + 707 + /* 708 + * Transactions that contain inode's metadata needed to complete 709 + * fsync and fdatasync, respectively. 710 + */ 711 + tid_t i_sync_tid; 712 + tid_t i_datasync_tid; 713 }; 714 715 /* ··· 750 #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 751 #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ 752 #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ 753 + #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ 754 755 #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt 756 #define set_opt(o, opt) o |= EXT4_MOUNT_##opt ··· 1324 ext4_fsblk_t goal, unsigned long *count, int *errp); 1325 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); 1326 extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); 1327 extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, 1328 ext4_fsblk_t block, unsigned long count); 1329 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); ··· 1384 extern void ext4_discard_preallocations(struct inode *); 1385 extern int __init init_ext4_mballoc(void); 1386 extern void exit_ext4_mballoc(void); 1387 + extern void ext4_free_blocks(handle_t *handle, struct inode *inode, 1388 + struct buffer_head *bh, ext4_fsblk_t block, 1389 + unsigned long count, int flags); 1390 extern int ext4_mb_add_groupinfo(struct super_block *sb, 1391 ext4_group_t i, struct ext4_group_desc *desc); 1392 extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); 1393 extern void 
ext4_mb_put_buddy_cache_lock(struct super_block *, 1394 ext4_group_t, int); 1395 /* inode.c */ 1396 struct buffer_head *ext4_getblk(handle_t *, struct inode *, 1397 ext4_lblk_t, int, int *); 1398 struct buffer_head *ext4_bread(handle_t *, struct inode *,
+61 -25
fs/ext4/ext4_jbd2.c
··· 4 5 #include "ext4_jbd2.h" 6 7 int __ext4_journal_get_undo_access(const char *where, handle_t *handle, 8 struct buffer_head *bh) 9 { ··· 34 return err; 35 } 36 37 - int __ext4_journal_forget(const char *where, handle_t *handle, 38 - struct buffer_head *bh) 39 { 40 - int err = 0; 41 42 - if (ext4_handle_valid(handle)) { 43 - err = jbd2_journal_forget(handle, bh); 44 - if (err) 45 - ext4_journal_abort_handle(where, __func__, bh, 46 - handle, err); 47 - } 48 - else 49 bforget(bh); 50 - return err; 51 - } 52 - 53 - int __ext4_journal_revoke(const char *where, handle_t *handle, 54 - ext4_fsblk_t blocknr, struct buffer_head *bh) 55 - { 56 - int err = 0; 57 - 58 - if (ext4_handle_valid(handle)) { 59 - err = jbd2_journal_revoke(handle, blocknr, bh); 60 - if (err) 61 - ext4_journal_abort_handle(where, __func__, bh, 62 - handle, err); 63 } 64 - else 65 - bforget(bh); 66 return err; 67 } 68
··· 4 5 #include "ext4_jbd2.h" 6 7 + #include <trace/events/ext4.h> 8 + 9 int __ext4_journal_get_undo_access(const char *where, handle_t *handle, 10 struct buffer_head *bh) 11 { ··· 32 return err; 33 } 34 35 + /* 36 + * The ext4 forget function must perform a revoke if we are freeing data 37 + * which has been journaled. Metadata (eg. indirect blocks) must be 38 + * revoked in all cases. 39 + * 40 + * "bh" may be NULL: a metadata block may have been freed from memory 41 + * but there may still be a record of it in the journal, and that record 42 + * still needs to be revoked. 43 + * 44 + * If the handle isn't valid we're not journaling, but we still need to 45 + * call into ext4_journal_revoke() to put the buffer head. 46 + */ 47 + int __ext4_forget(const char *where, handle_t *handle, int is_metadata, 48 + struct inode *inode, struct buffer_head *bh, 49 + ext4_fsblk_t blocknr) 50 { 51 + int err; 52 53 + might_sleep(); 54 + 55 + trace_ext4_forget(inode, is_metadata, blocknr); 56 + BUFFER_TRACE(bh, "enter"); 57 + 58 + jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " 59 + "data mode %x\n", 60 + bh, is_metadata, inode->i_mode, 61 + test_opt(inode->i_sb, DATA_FLAGS)); 62 + 63 + /* In the no journal case, we can just do a bforget and return */ 64 + if (!ext4_handle_valid(handle)) { 65 bforget(bh); 66 + return 0; 67 } 68 + 69 + /* Never use the revoke function if we are doing full data 70 + * journaling: there is no need to, and a V1 superblock won't 71 + * support it. Otherwise, only skip the revoke on un-journaled 72 + * data blocks. 
*/ 73 + 74 + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || 75 + (!is_metadata && !ext4_should_journal_data(inode))) { 76 + if (bh) { 77 + BUFFER_TRACE(bh, "call jbd2_journal_forget"); 78 + err = jbd2_journal_forget(handle, bh); 79 + if (err) 80 + ext4_journal_abort_handle(where, __func__, bh, 81 + handle, err); 82 + return err; 83 + } 84 + return 0; 85 + } 86 + 87 + /* 88 + * data!=journal && (is_metadata || should_journal_data(inode)) 89 + */ 90 + BUFFER_TRACE(bh, "call jbd2_journal_revoke"); 91 + err = jbd2_journal_revoke(handle, blocknr, bh); 92 + if (err) { 93 + ext4_journal_abort_handle(where, __func__, bh, handle, err); 94 + ext4_abort(inode->i_sb, __func__, 95 + "error %d when attempting revoke", err); 96 + } 97 + BUFFER_TRACE(bh, "exit"); 98 return err; 99 } 100
+26 -18
fs/ext4/ext4_jbd2.h
··· 49 50 #define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ 51 EXT4_XATTR_TRANS_BLOCKS - 2 + \ 52 - 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) 53 54 /* 55 * Define the number of metadata blocks we need to account to modify data. ··· 57 * This include super block, inode block, quota blocks and xattr blocks 58 */ 59 #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ 60 - 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) 61 62 /* Delete operations potentially hit one directory's namespace plus an 63 * entire inode, plus arbitrary amounts of bitmap/indirection data. Be ··· 92 * but inode, sb and group updates are done only once */ 93 #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ 94 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) 95 #define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ 96 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) 97 #else ··· 100 #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 101 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0 102 #endif 103 104 int 105 ext4_mark_iloc_dirty(handle_t *handle, ··· 120 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); 121 122 /* 123 - * Wrapper functions with which ext4 calls into JBD. The intent here is 124 - * to allow these to be turned into appropriate stubs so ext4 can control 125 - * ext2 filesystems, so ext2+ext4 systems only nee one fs. This work hasn't 126 - * been done yet. 
127 */ 128 - 129 void ext4_journal_abort_handle(const char *caller, const char *err_fn, 130 struct buffer_head *bh, handle_t *handle, int err); 131 ··· 131 int __ext4_journal_get_write_access(const char *where, handle_t *handle, 132 struct buffer_head *bh); 133 134 - /* When called with an invalid handle, this will still do a put on the BH */ 135 - int __ext4_journal_forget(const char *where, handle_t *handle, 136 - struct buffer_head *bh); 137 - 138 - /* When called with an invalid handle, this will still do a put on the BH */ 139 - int __ext4_journal_revoke(const char *where, handle_t *handle, 140 - ext4_fsblk_t blocknr, struct buffer_head *bh); 141 142 int __ext4_journal_get_create_access(const char *where, 143 handle_t *handle, struct buffer_head *bh); ··· 145 __ext4_journal_get_undo_access(__func__, (handle), (bh)) 146 #define ext4_journal_get_write_access(handle, bh) \ 147 __ext4_journal_get_write_access(__func__, (handle), (bh)) 148 - #define ext4_journal_revoke(handle, blocknr, bh) \ 149 - __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) 150 #define ext4_journal_get_create_access(handle, bh) \ 151 __ext4_journal_get_create_access(__func__, (handle), (bh)) 152 - #define ext4_journal_forget(handle, bh) \ 153 - __ext4_journal_forget(__func__, (handle), (bh)) 154 #define ext4_handle_dirty_metadata(handle, inode, bh) \ 155 __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh)) 156 ··· 247 if (ext4_handle_valid(handle)) 248 return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); 249 return 0; 250 } 251 252 /* super.c */
··· 49 50 #define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ 51 EXT4_XATTR_TRANS_BLOCKS - 2 + \ 52 + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) 53 54 /* 55 * Define the number of metadata blocks we need to account to modify data. ··· 57 * This include super block, inode block, quota blocks and xattr blocks 58 */ 59 #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ 60 + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) 61 62 /* Delete operations potentially hit one directory's namespace plus an 63 * entire inode, plus arbitrary amounts of bitmap/indirection data. Be ··· 92 * but inode, sb and group updates are done only once */ 93 #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ 94 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) 95 + 96 #define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ 97 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) 98 #else ··· 99 #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 100 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0 101 #endif 102 + #define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) 103 + #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) 104 + #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) 105 106 int 107 ext4_mark_iloc_dirty(handle_t *handle, ··· 116 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); 117 118 /* 119 + * Wrapper functions with which ext4 calls into JBD. 
120 */ 121 void ext4_journal_abort_handle(const char *caller, const char *err_fn, 122 struct buffer_head *bh, handle_t *handle, int err); 123 ··· 131 int __ext4_journal_get_write_access(const char *where, handle_t *handle, 132 struct buffer_head *bh); 133 134 + int __ext4_forget(const char *where, handle_t *handle, int is_metadata, 135 + struct inode *inode, struct buffer_head *bh, 136 + ext4_fsblk_t blocknr); 137 138 int __ext4_journal_get_create_access(const char *where, 139 handle_t *handle, struct buffer_head *bh); ··· 149 __ext4_journal_get_undo_access(__func__, (handle), (bh)) 150 #define ext4_journal_get_write_access(handle, bh) \ 151 __ext4_journal_get_write_access(__func__, (handle), (bh)) 152 + #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ 153 + __ext4_forget(__func__, (handle), (is_metadata), (inode), (bh),\ 154 + (block_nr)) 155 #define ext4_journal_get_create_access(handle, bh) \ 156 __ext4_journal_get_create_access(__func__, (handle), (bh)) 157 #define ext4_handle_dirty_metadata(handle, inode, bh) \ 158 __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh)) 159 ··· 252 if (ext4_handle_valid(handle)) 253 return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); 254 return 0; 255 + } 256 + 257 + static inline void ext4_update_inode_fsync_trans(handle_t *handle, 258 + struct inode *inode, 259 + int datasync) 260 + { 261 + struct ext4_inode_info *ei = EXT4_I(inode); 262 + 263 + if (ext4_handle_valid(handle)) { 264 + ei->i_sync_tid = handle->h_transaction->t_tid; 265 + if (datasync) 266 + ei->i_datasync_tid = handle->h_transaction->t_tid; 267 + } 268 } 269 270 /* super.c */
+24 -20
fs/ext4/extents.c
··· 1007 for (i = 0; i < depth; i++) { 1008 if (!ablocks[i]) 1009 continue; 1010 - ext4_free_blocks(handle, inode, ablocks[i], 1, 1); 1011 } 1012 } 1013 kfree(ablocks); ··· 1762 while (block < last && block != EXT_MAX_BLOCK) { 1763 num = last - block; 1764 /* find extent for this block */ 1765 path = ext4_ext_find_extent(inode, block, path); 1766 if (IS_ERR(path)) { 1767 err = PTR_ERR(path); 1768 path = NULL; ··· 1960 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, 1961 struct ext4_ext_path *path) 1962 { 1963 - struct buffer_head *bh; 1964 int err; 1965 ext4_fsblk_t leaf; 1966 ··· 1975 if (err) 1976 return err; 1977 ext_debug("index is empty, remove it, free block %llu\n", leaf); 1978 - bh = sb_find_get_block(inode->i_sb, leaf); 1979 - ext4_forget(handle, 1, inode, bh, leaf); 1980 - ext4_free_blocks(handle, inode, leaf, 1, 1); 1981 return err; 1982 } 1983 ··· 2043 struct ext4_extent *ex, 2044 ext4_lblk_t from, ext4_lblk_t to) 2045 { 2046 - struct buffer_head *bh; 2047 unsigned short ee_len = ext4_ext_get_actual_len(ex); 2048 - int i, metadata = 0; 2049 2050 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 2051 - metadata = 1; 2052 #ifdef EXTENTS_STATS 2053 { 2054 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ··· 2072 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2073 start = ext_pblock(ex) + ee_len - num; 2074 ext_debug("free last %u blocks starting %llu\n", num, start); 2075 - for (i = 0; i < num; i++) { 2076 - bh = sb_find_get_block(inode->i_sb, start + i); 2077 - ext4_forget(handle, 0, inode, bh, start + i); 2078 - } 2079 - ext4_free_blocks(handle, inode, start, num, metadata); 2080 } else if (from == le32_to_cpu(ex->ee_block) 2081 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 2082 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", ··· 2163 correct_index = 1; 2164 credits += (ext_depth(inode)) + 1; 2165 } 2166 - credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); 2167 2168 err = 
ext4_ext_truncate_extend_restart(handle, inode, credits); 2169 if (err) ··· 3060 if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { 3061 ret = ext4_convert_unwritten_extents_dio(handle, inode, 3062 path); 3063 goto out2; 3064 } 3065 /* buffered IO case */ ··· 3089 ret = ext4_ext_convert_to_initialized(handle, inode, 3090 path, iblock, 3091 max_blocks); 3092 out: 3093 if (ret <= 0) { 3094 err = ret; ··· 3319 /* not a good idea to call discard here directly, 3320 * but otherwise we'd need to call it every free() */ 3321 ext4_discard_preallocations(inode); 3322 - ext4_free_blocks(handle, inode, ext_pblock(&newex), 3323 - ext4_ext_get_actual_len(&newex), 0); 3324 goto out2; 3325 } 3326 ··· 3329 allocated = ext4_ext_get_actual_len(&newex); 3330 set_buffer_new(bh_result); 3331 3332 - /* Cache only when it is _not_ an uninitialized extent */ 3333 - if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) 3334 ext4_ext_put_in_cache(inode, iblock, allocated, newblock, 3335 EXT4_EXT_CACHE_EXTENT); 3336 out: 3337 if (allocated > max_blocks) 3338 allocated = max_blocks; ··· 3726 * Walk the extent tree gathering extent information. 3727 * ext4_ext_fiemap_cb will push extents back to user. 3728 */ 3729 - down_read(&EXT4_I(inode)->i_data_sem); 3730 error = ext4_ext_walk_space(inode, start_blk, len_blks, 3731 ext4_ext_fiemap_cb, fieinfo); 3732 - up_read(&EXT4_I(inode)->i_data_sem); 3733 } 3734 3735 return error;
··· 1007 for (i = 0; i < depth; i++) { 1008 if (!ablocks[i]) 1009 continue; 1010 + ext4_free_blocks(handle, inode, 0, ablocks[i], 1, 1011 + EXT4_FREE_BLOCKS_METADATA); 1012 } 1013 } 1014 kfree(ablocks); ··· 1761 while (block < last && block != EXT_MAX_BLOCK) { 1762 num = last - block; 1763 /* find extent for this block */ 1764 + down_read(&EXT4_I(inode)->i_data_sem); 1765 path = ext4_ext_find_extent(inode, block, path); 1766 + up_read(&EXT4_I(inode)->i_data_sem); 1767 if (IS_ERR(path)) { 1768 err = PTR_ERR(path); 1769 path = NULL; ··· 1957 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, 1958 struct ext4_ext_path *path) 1959 { 1960 int err; 1961 ext4_fsblk_t leaf; 1962 ··· 1973 if (err) 1974 return err; 1975 ext_debug("index is empty, remove it, free block %llu\n", leaf); 1976 + ext4_free_blocks(handle, inode, 0, leaf, 1, 1977 + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 1978 return err; 1979 } 1980 ··· 2042 struct ext4_extent *ex, 2043 ext4_lblk_t from, ext4_lblk_t to) 2044 { 2045 unsigned short ee_len = ext4_ext_get_actual_len(ex); 2046 + int flags = EXT4_FREE_BLOCKS_FORGET; 2047 2048 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 2049 + flags |= EXT4_FREE_BLOCKS_METADATA; 2050 #ifdef EXTENTS_STATS 2051 { 2052 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ··· 2072 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2073 start = ext_pblock(ex) + ee_len - num; 2074 ext_debug("free last %u blocks starting %llu\n", num, start); 2075 + ext4_free_blocks(handle, inode, 0, start, num, flags); 2076 } else if (from == le32_to_cpu(ex->ee_block) 2077 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 2078 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", ··· 2167 correct_index = 1; 2168 credits += (ext_depth(inode)) + 1; 2169 } 2170 + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); 2171 2172 err = ext4_ext_truncate_extend_restart(handle, inode, credits); 2173 if (err) ··· 3064 if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) 
{ 3065 ret = ext4_convert_unwritten_extents_dio(handle, inode, 3066 path); 3067 + if (ret >= 0) 3068 + ext4_update_inode_fsync_trans(handle, inode, 1); 3069 goto out2; 3070 } 3071 /* buffered IO case */ ··· 3091 ret = ext4_ext_convert_to_initialized(handle, inode, 3092 path, iblock, 3093 max_blocks); 3094 + if (ret >= 0) 3095 + ext4_update_inode_fsync_trans(handle, inode, 1); 3096 out: 3097 if (ret <= 0) { 3098 err = ret; ··· 3319 /* not a good idea to call discard here directly, 3320 * but otherwise we'd need to call it every free() */ 3321 ext4_discard_preallocations(inode); 3322 + ext4_free_blocks(handle, inode, 0, ext_pblock(&newex), 3323 + ext4_ext_get_actual_len(&newex), 0); 3324 goto out2; 3325 } 3326 ··· 3329 allocated = ext4_ext_get_actual_len(&newex); 3330 set_buffer_new(bh_result); 3331 3332 + /* 3333 + * Cache the extent and update transaction to commit on fdatasync only 3334 + * when it is _not_ an uninitialized extent. 3335 + */ 3336 + if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { 3337 ext4_ext_put_in_cache(inode, iblock, allocated, newblock, 3338 EXT4_EXT_CACHE_EXTENT); 3339 + ext4_update_inode_fsync_trans(handle, inode, 1); 3340 + } else 3341 + ext4_update_inode_fsync_trans(handle, inode, 0); 3342 out: 3343 if (allocated > max_blocks) 3344 allocated = max_blocks; ··· 3720 * Walk the extent tree gathering extent information. 3721 * ext4_ext_fiemap_cb will push extents back to user. 3722 */ 3723 error = ext4_ext_walk_space(inode, start_blk, len_blks, 3724 ext4_ext_fiemap_cb, fieinfo); 3725 } 3726 3727 return error;
+20 -34
fs/ext4/fsync.c
··· 51 int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) 52 { 53 struct inode *inode = dentry->d_inode; 54 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 55 - int err, ret = 0; 56 57 J_ASSERT(ext4_journal_current_handle() == NULL); 58 59 trace_ext4_sync_file(file, dentry, datasync); 60 61 ret = flush_aio_dio_completed_IO(inode); 62 if (ret < 0) 63 - goto out; 64 /* 65 - * data=writeback: 66 * The caller's filemap_fdatawrite()/wait will sync the data. 67 - * sync_inode() will sync the metadata 68 - * 69 - * data=ordered: 70 - * The caller's filemap_fdatawrite() will write the data and 71 - * sync_inode() will write the inode if it is dirty. Then the caller's 72 - * filemap_fdatawait() will wait on the pages. 73 * 74 * data=journal: 75 * filemap_fdatawrite won't do anything (the buffers are clean). ··· 84 * (they were dirtied by commit). But that's OK - the blocks are 85 * safe in-journal, which is all fsync() needs to ensure. 86 */ 87 - if (ext4_should_journal_data(inode)) { 88 - ret = ext4_force_commit(inode->i_sb); 89 - goto out; 90 - } 91 92 - if (!journal) 93 - ret = sync_mapping_buffers(inode->i_mapping); 94 - 95 - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 96 - goto out; 97 - 98 - /* 99 - * The VFS has written the file data. If the inode is unaltered 100 - * then we need not start a commit. 101 - */ 102 - if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) { 103 - struct writeback_control wbc = { 104 - .sync_mode = WB_SYNC_ALL, 105 - .nr_to_write = 0, /* sys_fsync did this */ 106 - }; 107 - err = sync_inode(inode, &wbc); 108 - if (ret == 0) 109 - ret = err; 110 - } 111 - out: 112 - if (journal && (journal->j_flags & JBD2_BARRIER)) 113 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 114 return ret; 115 }
··· 51 int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) 52 { 53 struct inode *inode = dentry->d_inode; 54 + struct ext4_inode_info *ei = EXT4_I(inode); 55 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 56 + int ret; 57 + tid_t commit_tid; 58 59 J_ASSERT(ext4_journal_current_handle() == NULL); 60 61 trace_ext4_sync_file(file, dentry, datasync); 62 63 + if (inode->i_sb->s_flags & MS_RDONLY) 64 + return 0; 65 + 66 ret = flush_aio_dio_completed_IO(inode); 67 if (ret < 0) 68 + return ret; 69 + 70 + if (!journal) 71 + return simple_fsync(file, dentry, datasync); 72 + 73 /* 74 + * data=writeback,ordered: 75 * The caller's filemap_fdatawrite()/wait will sync the data. 76 + * Metadata is in the journal, we wait for proper transaction to 77 + * commit here. 78 * 79 * data=journal: 80 * filemap_fdatawrite won't do anything (the buffers are clean). ··· 79 * (they were dirtied by commit). But that's OK - the blocks are 80 * safe in-journal, which is all fsync() needs to ensure. 81 */ 82 + if (ext4_should_journal_data(inode)) 83 + return ext4_force_commit(inode->i_sb); 84 85 + commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; 86 + if (jbd2_log_start_commit(journal, commit_tid)) 87 + jbd2_log_wait_commit(journal, commit_tid); 88 + else if (journal->j_flags & JBD2_BARRIER) 89 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 90 return ret; 91 }
+85 -108
fs/ext4/inode.c
··· 71 } 72 73 /* 74 - * The ext4 forget function must perform a revoke if we are freeing data 75 - * which has been journaled. Metadata (eg. indirect blocks) must be 76 - * revoked in all cases. 77 - * 78 - * "bh" may be NULL: a metadata block may have been freed from memory 79 - * but there may still be a record of it in the journal, and that record 80 - * still needs to be revoked. 81 - * 82 - * If the handle isn't valid we're not journaling, but we still need to 83 - * call into ext4_journal_revoke() to put the buffer head. 84 - */ 85 - int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 86 - struct buffer_head *bh, ext4_fsblk_t blocknr) 87 - { 88 - int err; 89 - 90 - might_sleep(); 91 - 92 - BUFFER_TRACE(bh, "enter"); 93 - 94 - jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " 95 - "data mode %x\n", 96 - bh, is_metadata, inode->i_mode, 97 - test_opt(inode->i_sb, DATA_FLAGS)); 98 - 99 - /* Never use the revoke function if we are doing full data 100 - * journaling: there is no need to, and a V1 superblock won't 101 - * support it. Otherwise, only skip the revoke on un-journaled 102 - * data blocks. */ 103 - 104 - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || 105 - (!is_metadata && !ext4_should_journal_data(inode))) { 106 - if (bh) { 107 - BUFFER_TRACE(bh, "call jbd2_journal_forget"); 108 - return ext4_journal_forget(handle, bh); 109 - } 110 - return 0; 111 - } 112 - 113 - /* 114 - * data!=journal && (is_metadata || should_journal_data(inode)) 115 - */ 116 - BUFFER_TRACE(bh, "call ext4_journal_revoke"); 117 - err = ext4_journal_revoke(handle, blocknr, bh); 118 - if (err) 119 - ext4_abort(inode->i_sb, __func__, 120 - "error %d when attempting revoke", err); 121 - BUFFER_TRACE(bh, "exit"); 122 - return err; 123 - } 124 - 125 - /* 126 * Work out how many blocks we need to proceed with the next chunk of a 127 * truncate transaction. 
128 */ ··· 669 return ret; 670 failed_out: 671 for (i = 0; i < index; i++) 672 - ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 673 return ret; 674 } 675 ··· 765 return err; 766 failed: 767 /* Allocation failed, free what we already allocated */ 768 for (i = 1; i <= n ; i++) { 769 - BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); 770 - ext4_journal_forget(handle, branch[i].bh); 771 } 772 - for (i = 0; i < indirect_blks; i++) 773 - ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 774 775 - ext4_free_blocks(handle, inode, new_blocks[i], num, 0); 776 777 return err; 778 } ··· 857 858 err_out: 859 for (i = 1; i <= num; i++) { 860 - BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); 861 - ext4_journal_forget(handle, where[i].bh); 862 - ext4_free_blocks(handle, inode, 863 - le32_to_cpu(where[i-1].key), 1, 0); 864 } 865 - ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0); 866 867 return err; 868 } ··· 979 if (!err) 980 err = ext4_splice_branch(handle, inode, iblock, 981 partial, indirect_blks, count); 982 - else 983 goto cleanup; 984 985 set_buffer_new(bh_result); 986 got_it: 987 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 988 if (count > blocks_to_boundary) ··· 1012 EXT4_I(inode)->i_reserved_meta_blocks; 1013 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1014 1015 - return total; 1016 } 1017 /* 1018 * Calculate the number of metadata blocks need to reserve ··· 1494 return ext4_journal_get_write_access(handle, bh); 1495 } 1496 1497 static int ext4_write_begin(struct file *file, struct address_space *mapping, 1498 loff_t pos, unsigned len, unsigned flags, 1499 struct page **pagep, void **fsdata) ··· 1569 1570 ext4_journal_stop(handle); 1571 if (pos + len > inode->i_size) { 1572 - ext4_truncate(inode); 1573 /* 1574 * If truncate failed early the inode might 1575 * still be on the orphan list; we need to ··· 1679 ret = ret2; 1680 1681 if (pos + len > inode->i_size) { 1682 - ext4_truncate(inode); 1683 
/* 1684 * If truncate failed early the inode might still be 1685 * on the orphan list; we need to make sure the inode ··· 1721 ret = ret2; 1722 1723 if (pos + len > inode->i_size) { 1724 - ext4_truncate(inode); 1725 /* 1726 * If truncate failed early the inode might still be 1727 * on the orphan list; we need to make sure the inode ··· 1784 if (!ret) 1785 ret = ret2; 1786 if (pos + len > inode->i_size) { 1787 - ext4_truncate(inode); 1788 /* 1789 * If truncate failed early the inode might still be 1790 * on the orphan list; we need to make sure the inode ··· 2570 } 2571 2572 static int __ext4_journalled_writepage(struct page *page, 2573 - struct writeback_control *wbc, 2574 unsigned int len) 2575 { 2576 struct address_space *mapping = page->mapping; ··· 2727 * doesn't seem much point in redirtying the page here. 2728 */ 2729 ClearPageChecked(page); 2730 - return __ext4_journalled_writepage(page, wbc, len); 2731 } 2732 2733 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) ··· 2757 * number of contiguous block. So we will limit 2758 * number of contiguous block to a sane value 2759 */ 2760 - if (!(inode->i_flags & EXT4_EXTENTS_FL) && 2761 (max_blocks > EXT4_MAX_TRANS_DATA)) 2762 max_blocks = EXT4_MAX_TRANS_DATA; 2763 ··· 3060 * i_size_read because we hold i_mutex. 3061 */ 3062 if (pos + len > inode->i_size) 3063 - ext4_truncate(inode); 3064 } 3065 3066 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) ··· 4089 __le32 *last) 4090 { 4091 __le32 *p; 4092 if (try_to_extend_transaction(handle, inode)) { 4093 if (bh) { 4094 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); ··· 4108 } 4109 } 4110 4111 - /* 4112 - * Any buffers which are on the journal will be in memory. We 4113 - * find them on the hash table so jbd2_journal_revoke() will 4114 - * run jbd2_journal_forget() on them. We've already detached 4115 - * each block from the file, so bforget() in 4116 - * jbd2_journal_forget() should be safe. 
4117 - * 4118 - * AKPM: turn on bforget in jbd2_journal_forget()!!! 4119 - */ 4120 - for (p = first; p < last; p++) { 4121 - u32 nr = le32_to_cpu(*p); 4122 - if (nr) { 4123 - struct buffer_head *tbh; 4124 4125 - *p = 0; 4126 - tbh = sb_find_get_block(inode->i_sb, nr); 4127 - ext4_forget(handle, 0, inode, tbh, nr); 4128 - } 4129 - } 4130 - 4131 - ext4_free_blocks(handle, inode, block_to_free, count, 0); 4132 } 4133 4134 /** ··· 4299 blocks_for_truncate(inode)); 4300 } 4301 4302 - ext4_free_blocks(handle, inode, nr, 1, 1); 4303 4304 if (parent_bh) { 4305 /* ··· 4739 struct ext4_iloc iloc; 4740 struct ext4_inode *raw_inode; 4741 struct ext4_inode_info *ei; 4742 - struct buffer_head *bh; 4743 struct inode *inode; 4744 long ret; 4745 int block; 4746 ··· 4751 return inode; 4752 4753 ei = EXT4_I(inode); 4754 4755 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4756 if (ret < 0) 4757 goto bad_inode; 4758 - bh = iloc.bh; 4759 raw_inode = ext4_raw_inode(&iloc); 4760 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4761 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); ··· 4778 if (inode->i_mode == 0 || 4779 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 4780 /* this inode is deleted */ 4781 - brelse(bh); 4782 ret = -ESTALE; 4783 goto bad_inode; 4784 } ··· 4805 ei->i_data[block] = raw_inode->i_block[block]; 4806 INIT_LIST_HEAD(&ei->i_orphan); 4807 4808 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4809 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 4810 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 4811 EXT4_INODE_SIZE(inode->i_sb)) { 4812 - brelse(bh); 4813 ret = -EIO; 4814 goto bad_inode; 4815 } ··· 4865 4866 ret = 0; 4867 if (ei->i_file_acl && 4868 - ((ei->i_file_acl < 4869 - (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + 4870 - EXT4_SB(sb)->s_gdb_count)) || 4871 - (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) { 4872 ext4_error(sb, __func__, 4873 "bad extended attribute block %llu in inode #%lu", 4874 
ei->i_file_acl, inode->i_ino); ··· 4883 /* Validate block references which are part of inode */ 4884 ret = ext4_check_inode_blockref(inode); 4885 } 4886 - if (ret) { 4887 - brelse(bh); 4888 goto bad_inode; 4889 - } 4890 4891 if (S_ISREG(inode->i_mode)) { 4892 inode->i_op = &ext4_file_inode_operations; ··· 4912 init_special_inode(inode, inode->i_mode, 4913 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4914 } else { 4915 - brelse(bh); 4916 ret = -EIO; 4917 ext4_error(inode->i_sb, __func__, 4918 "bogus i_mode (%o) for inode=%lu", ··· 4924 return inode; 4925 4926 bad_inode: 4927 iget_failed(inode); 4928 return ERR_PTR(ret); 4929 } ··· 5084 err = rc; 5085 ei->i_state &= ~EXT4_STATE_NEW; 5086 5087 out_brelse: 5088 brelse(bh); 5089 ext4_std_error(inode->i_sb, err); ··· 5204 5205 /* (user+group)*(old+new) structure, inode write (sb, 5206 * inode block, ? - but truncate inode update has it) */ 5207 - handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ 5208 - EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); 5209 if (IS_ERR(handle)) { 5210 error = PTR_ERR(handle); 5211 goto err_out;
··· 71 } 72 73 /* 74 * Work out how many blocks we need to proceed with the next chunk of a 75 * truncate transaction. 76 */ ··· 721 return ret; 722 failed_out: 723 for (i = 0; i < index; i++) 724 + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); 725 return ret; 726 } 727 ··· 817 return err; 818 failed: 819 /* Allocation failed, free what we already allocated */ 820 + ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); 821 for (i = 1; i <= n ; i++) { 822 + /* 823 + * branch[i].bh is newly allocated, so there is no 824 + * need to revoke the block, which is why we don't 825 + * need to set EXT4_FREE_BLOCKS_METADATA. 826 + */ 827 + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 828 + EXT4_FREE_BLOCKS_FORGET); 829 } 830 + for (i = n+1; i < indirect_blks; i++) 831 + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); 832 833 + ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0); 834 835 return err; 836 } ··· 903 904 err_out: 905 for (i = 1; i <= num; i++) { 906 + /* 907 + * branch[i].bh is newly allocated, so there is no 908 + * need to revoke the block, which is why we don't 909 + * need to set EXT4_FREE_BLOCKS_METADATA. 
910 + */ 911 + ext4_free_blocks(handle, inode, where[i].bh, 0, 1, 912 + EXT4_FREE_BLOCKS_FORGET); 913 } 914 + ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key), 915 + blks, 0); 916 917 return err; 918 } ··· 1021 if (!err) 1022 err = ext4_splice_branch(handle, inode, iblock, 1023 partial, indirect_blks, count); 1024 + if (err) 1025 goto cleanup; 1026 1027 set_buffer_new(bh_result); 1028 + 1029 + ext4_update_inode_fsync_trans(handle, inode, 1); 1030 got_it: 1031 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 1032 if (count > blocks_to_boundary) ··· 1052 EXT4_I(inode)->i_reserved_meta_blocks; 1053 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1054 1055 + return (total << inode->i_blkbits); 1056 } 1057 /* 1058 * Calculate the number of metadata blocks need to reserve ··· 1534 return ext4_journal_get_write_access(handle, bh); 1535 } 1536 1537 + /* 1538 + * Truncate blocks that were not used by write. We have to truncate the 1539 + * pagecache as well so that corresponding buffers get properly unmapped. 
1540 + */ 1541 + static void ext4_truncate_failed_write(struct inode *inode) 1542 + { 1543 + truncate_inode_pages(inode->i_mapping, inode->i_size); 1544 + ext4_truncate(inode); 1545 + } 1546 + 1547 static int ext4_write_begin(struct file *file, struct address_space *mapping, 1548 loff_t pos, unsigned len, unsigned flags, 1549 struct page **pagep, void **fsdata) ··· 1599 1600 ext4_journal_stop(handle); 1601 if (pos + len > inode->i_size) { 1602 + ext4_truncate_failed_write(inode); 1603 /* 1604 * If truncate failed early the inode might 1605 * still be on the orphan list; we need to ··· 1709 ret = ret2; 1710 1711 if (pos + len > inode->i_size) { 1712 + ext4_truncate_failed_write(inode); 1713 /* 1714 * If truncate failed early the inode might still be 1715 * on the orphan list; we need to make sure the inode ··· 1751 ret = ret2; 1752 1753 if (pos + len > inode->i_size) { 1754 + ext4_truncate_failed_write(inode); 1755 /* 1756 * If truncate failed early the inode might still be 1757 * on the orphan list; we need to make sure the inode ··· 1814 if (!ret) 1815 ret = ret2; 1816 if (pos + len > inode->i_size) { 1817 + ext4_truncate_failed_write(inode); 1818 /* 1819 * If truncate failed early the inode might still be 1820 * on the orphan list; we need to make sure the inode ··· 2600 } 2601 2602 static int __ext4_journalled_writepage(struct page *page, 2603 unsigned int len) 2604 { 2605 struct address_space *mapping = page->mapping; ··· 2758 * doesn't seem much point in redirtying the page here. 2759 */ 2760 ClearPageChecked(page); 2761 + return __ext4_journalled_writepage(page, len); 2762 } 2763 2764 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) ··· 2788 * number of contiguous block. So we will limit 2789 * number of contiguous block to a sane value 2790 */ 2791 + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) && 2792 (max_blocks > EXT4_MAX_TRANS_DATA)) 2793 max_blocks = EXT4_MAX_TRANS_DATA; 2794 ··· 3091 * i_size_read because we hold i_mutex. 
3092 */ 3093 if (pos + len > inode->i_size) 3094 + ext4_truncate_failed_write(inode); 3095 } 3096 3097 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) ··· 4120 __le32 *last) 4121 { 4122 __le32 *p; 4123 + int flags = EXT4_FREE_BLOCKS_FORGET; 4124 + 4125 + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 4126 + flags |= EXT4_FREE_BLOCKS_METADATA; 4127 + 4128 if (try_to_extend_transaction(handle, inode)) { 4129 if (bh) { 4130 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); ··· 4134 } 4135 } 4136 4137 + for (p = first; p < last; p++) 4138 + *p = 0; 4139 4140 + ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); 4141 } 4142 4143 /** ··· 4342 blocks_for_truncate(inode)); 4343 } 4344 4345 + ext4_free_blocks(handle, inode, 0, nr, 1, 4346 + EXT4_FREE_BLOCKS_METADATA); 4347 4348 if (parent_bh) { 4349 /* ··· 4781 struct ext4_iloc iloc; 4782 struct ext4_inode *raw_inode; 4783 struct ext4_inode_info *ei; 4784 struct inode *inode; 4785 + journal_t *journal = EXT4_SB(sb)->s_journal; 4786 long ret; 4787 int block; 4788 ··· 4793 return inode; 4794 4795 ei = EXT4_I(inode); 4796 + iloc.bh = 0; 4797 4798 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4799 if (ret < 0) 4800 goto bad_inode; 4801 raw_inode = ext4_raw_inode(&iloc); 4802 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4803 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); ··· 4820 if (inode->i_mode == 0 || 4821 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 4822 /* this inode is deleted */ 4823 ret = -ESTALE; 4824 goto bad_inode; 4825 } ··· 4848 ei->i_data[block] = raw_inode->i_block[block]; 4849 INIT_LIST_HEAD(&ei->i_orphan); 4850 4851 + /* 4852 + * Set transaction id's of transactions that have to be committed 4853 + * to finish f[data]sync. 
We set them to currently running transaction 4854 + * as we cannot be sure that the inode or some of its metadata isn't 4855 + * part of the transaction - the inode could have been reclaimed and 4856 + * now it is reread from disk. 4857 + */ 4858 + if (journal) { 4859 + transaction_t *transaction; 4860 + tid_t tid; 4861 + 4862 + spin_lock(&journal->j_state_lock); 4863 + if (journal->j_running_transaction) 4864 + transaction = journal->j_running_transaction; 4865 + else 4866 + transaction = journal->j_committing_transaction; 4867 + if (transaction) 4868 + tid = transaction->t_tid; 4869 + else 4870 + tid = journal->j_commit_sequence; 4871 + spin_unlock(&journal->j_state_lock); 4872 + ei->i_sync_tid = tid; 4873 + ei->i_datasync_tid = tid; 4874 + } 4875 + 4876 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4877 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 4878 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 4879 EXT4_INODE_SIZE(inode->i_sb)) { 4880 ret = -EIO; 4881 goto bad_inode; 4882 } ··· 4884 4885 ret = 0; 4886 if (ei->i_file_acl && 4887 + !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { 4888 ext4_error(sb, __func__, 4889 "bad extended attribute block %llu in inode #%lu", 4890 ei->i_file_acl, inode->i_ino); ··· 4905 /* Validate block references which are part of inode */ 4906 ret = ext4_check_inode_blockref(inode); 4907 } 4908 + if (ret) 4909 goto bad_inode; 4910 4911 if (S_ISREG(inode->i_mode)) { 4912 inode->i_op = &ext4_file_inode_operations; ··· 4936 init_special_inode(inode, inode->i_mode, 4937 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4938 } else { 4939 ret = -EIO; 4940 ext4_error(inode->i_sb, __func__, 4941 "bogus i_mode (%o) for inode=%lu", ··· 4949 return inode; 4950 4951 bad_inode: 4952 + brelse(iloc.bh); 4953 iget_failed(inode); 4954 return ERR_PTR(ret); 4955 } ··· 5108 err = rc; 5109 ei->i_state &= ~EXT4_STATE_NEW; 5110 5111 + ext4_update_inode_fsync_trans(handle, inode, 0); 5112 out_brelse: 5113 
brelse(bh); 5114 ext4_std_error(inode->i_sb, err); ··· 5227 5228 /* (user+group)*(old+new) structure, inode write (sb, 5229 * inode block, ? - but truncate inode update has it) */ 5230 + handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ 5231 + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); 5232 if (IS_ERR(handle)) { 5233 error = PTR_ERR(handle); 5234 goto err_out;
+18 -11
fs/ext4/ioctl.c
··· 221 struct file *donor_filp; 222 int err; 223 224 if (copy_from_user(&me, 225 (struct move_extent __user *)arg, sizeof(me))) 226 return -EFAULT; 227 228 donor_filp = fget(me.donor_fd); 229 if (!donor_filp) 230 return -EBADF; 231 232 - if (!capable(CAP_DAC_OVERRIDE)) { 233 - if ((current->real_cred->fsuid != inode->i_uid) || 234 - !(inode->i_mode & S_IRUSR) || 235 - !(donor_filp->f_dentry->d_inode->i_mode & 236 - S_IRUSR)) { 237 - fput(donor_filp); 238 - return -EACCES; 239 - } 240 } 241 242 err = ext4_move_extents(filp, donor_filp, me.orig_start, 243 me.donor_start, me.len, &me.moved_len); 244 - fput(donor_filp); 245 246 if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) 247 - return -EFAULT; 248 - 249 return err; 250 } 251
··· 221 struct file *donor_filp; 222 int err; 223 224 + if (!(filp->f_mode & FMODE_READ) || 225 + !(filp->f_mode & FMODE_WRITE)) 226 + return -EBADF; 227 + 228 if (copy_from_user(&me, 229 (struct move_extent __user *)arg, sizeof(me))) 230 return -EFAULT; 231 + me.moved_len = 0; 232 233 donor_filp = fget(me.donor_fd); 234 if (!donor_filp) 235 return -EBADF; 236 237 + if (!(donor_filp->f_mode & FMODE_WRITE)) { 238 + err = -EBADF; 239 + goto mext_out; 240 } 241 + 242 + err = mnt_want_write(filp->f_path.mnt); 243 + if (err) 244 + goto mext_out; 245 246 err = ext4_move_extents(filp, donor_filp, me.orig_start, 247 me.donor_start, me.len, &me.moved_len); 248 + mnt_drop_write(filp->f_path.mnt); 249 + if (me.moved_len > 0) 250 + file_remove_suid(donor_filp); 251 252 if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) 253 + err = -EFAULT; 254 + mext_out: 255 + fput(donor_filp); 256 return err; 257 } 258
+80 -19
fs/ext4/mballoc.c
··· 2529 struct ext4_group_info *db; 2530 int err, count = 0, count2 = 0; 2531 struct ext4_free_data *entry; 2532 - ext4_fsblk_t discard_block; 2533 struct list_head *l, *ltmp; 2534 2535 list_for_each_safe(l, ltmp, &txn->t_private_list) { ··· 2558 page_cache_release(e4b.bd_bitmap_page); 2559 } 2560 ext4_unlock_group(sb, entry->group); 2561 - discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) 2562 - + entry->start_blk 2563 - + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 2564 - trace_ext4_discard_blocks(sb, (unsigned long long)discard_block, 2565 - entry->count); 2566 - sb_issue_discard(sb, discard_block, entry->count); 2567 2568 kmem_cache_free(ext4_free_ext_cachep, entry); 2569 ext4_mb_release_desc(&e4b); 2570 } ··· 3008 trace_ext4_mballoc_alloc(ac); 3009 else 3010 trace_ext4_mballoc_prealloc(ac); 3011 } 3012 3013 /* ··· 4313 ac->ac_status = AC_STATUS_CONTINUE; 4314 goto repeat; 4315 } else if (*errp) { 4316 ac->ac_b_ex.fe_len = 0; 4317 ar->len = 0; 4318 ext4_mb_show_ac(ac); ··· 4446 return 0; 4447 } 4448 4449 - /* 4450 - * Main entry point into mballoc to free blocks 4451 */ 4452 - void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, 4453 - ext4_fsblk_t block, unsigned long count, 4454 - int metadata, unsigned long *freed) 4455 { 4456 struct buffer_head *bitmap_bh = NULL; 4457 struct super_block *sb = inode->i_sb; 4458 struct ext4_allocation_context *ac = NULL; 4459 struct ext4_group_desc *gdp; 4460 struct ext4_super_block *es; 4461 unsigned int overflow; 4462 ext4_grpblk_t bit; 4463 struct buffer_head *gd_bh; ··· 4473 int err = 0; 4474 int ret; 4475 4476 - *freed = 0; 4477 4478 sbi = EXT4_SB(sb); 4479 es = EXT4_SB(sb)->s_es; 4480 - if (block < le32_to_cpu(es->s_first_data_block) || 4481 - block + count < block || 4482 - block + count > ext4_blocks_count(es)) { 4483 ext4_error(sb, __func__, 4484 "Freeing blocks not in datazone - " 4485 "block = %llu, count = %lu", block, count); ··· 4490 } 4491 4492 ext4_debug("freeing block 
%llu\n", block); 4493 - trace_ext4_free_blocks(inode, block, count, metadata); 4494 4495 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4496 if (ac) { ··· 4591 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4592 if (err) 4593 goto error_return; 4594 - if (metadata && ext4_handle_valid(handle)) { 4595 struct ext4_free_data *new_entry; 4596 /* 4597 * blocks being freed are metadata. these blocks shouldn't ··· 4631 4632 ext4_mb_release_desc(&e4b); 4633 4634 - *freed += count; 4635 4636 /* We dirtied the bitmap block */ 4637 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); ··· 4651 } 4652 sb->s_dirt = 1; 4653 error_return: 4654 brelse(bitmap_bh); 4655 ext4_std_error(sb, err); 4656 if (ac)
··· 2529 struct ext4_group_info *db; 2530 int err, count = 0, count2 = 0; 2531 struct ext4_free_data *entry; 2532 struct list_head *l, *ltmp; 2533 2534 list_for_each_safe(l, ltmp, &txn->t_private_list) { ··· 2559 page_cache_release(e4b.bd_bitmap_page); 2560 } 2561 ext4_unlock_group(sb, entry->group); 2562 + if (test_opt(sb, DISCARD)) { 2563 + ext4_fsblk_t discard_block; 2564 + struct ext4_super_block *es = EXT4_SB(sb)->s_es; 2565 2566 + discard_block = (ext4_fsblk_t)entry->group * 2567 + EXT4_BLOCKS_PER_GROUP(sb) 2568 + + entry->start_blk 2569 + + le32_to_cpu(es->s_first_data_block); 2570 + trace_ext4_discard_blocks(sb, 2571 + (unsigned long long)discard_block, 2572 + entry->count); 2573 + sb_issue_discard(sb, discard_block, entry->count); 2574 + } 2575 kmem_cache_free(ext4_free_ext_cachep, entry); 2576 ext4_mb_release_desc(&e4b); 2577 } ··· 3003 trace_ext4_mballoc_alloc(ac); 3004 else 3005 trace_ext4_mballoc_prealloc(ac); 3006 + } 3007 + 3008 + /* 3009 + * Called on failure; free up any blocks from the inode PA for this 3010 + * context. We don't need this for MB_GROUP_PA because we only change 3011 + * pa_free in ext4_mb_release_context(), but on failure, we've already 3012 + * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. 
3013 + */ 3014 + static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) 3015 + { 3016 + struct ext4_prealloc_space *pa = ac->ac_pa; 3017 + int len; 3018 + 3019 + if (pa && pa->pa_type == MB_INODE_PA) { 3020 + len = ac->ac_b_ex.fe_len; 3021 + pa->pa_free += len; 3022 + } 3023 + 3024 } 3025 3026 /* ··· 4290 ac->ac_status = AC_STATUS_CONTINUE; 4291 goto repeat; 4292 } else if (*errp) { 4293 + ext4_discard_allocated_blocks(ac); 4294 ac->ac_b_ex.fe_len = 0; 4295 ar->len = 0; 4296 ext4_mb_show_ac(ac); ··· 4422 return 0; 4423 } 4424 4425 + /** 4426 + * ext4_free_blocks() -- Free given blocks and update quota 4427 + * @handle: handle for this transaction 4428 + * @inode: inode 4429 + * @block: start physical block to free 4430 + * @count: number of blocks to count 4431 + * @metadata: Are these metadata blocks 4432 */ 4433 + void ext4_free_blocks(handle_t *handle, struct inode *inode, 4434 + struct buffer_head *bh, ext4_fsblk_t block, 4435 + unsigned long count, int flags) 4436 { 4437 struct buffer_head *bitmap_bh = NULL; 4438 struct super_block *sb = inode->i_sb; 4439 struct ext4_allocation_context *ac = NULL; 4440 struct ext4_group_desc *gdp; 4441 struct ext4_super_block *es; 4442 + unsigned long freed = 0; 4443 unsigned int overflow; 4444 ext4_grpblk_t bit; 4445 struct buffer_head *gd_bh; ··· 4443 int err = 0; 4444 int ret; 4445 4446 + if (bh) { 4447 + if (block) 4448 + BUG_ON(block != bh->b_blocknr); 4449 + else 4450 + block = bh->b_blocknr; 4451 + } 4452 4453 sbi = EXT4_SB(sb); 4454 es = EXT4_SB(sb)->s_es; 4455 + if (!ext4_data_block_valid(sbi, block, count)) { 4456 ext4_error(sb, __func__, 4457 "Freeing blocks not in datazone - " 4458 "block = %llu, count = %lu", block, count); ··· 4457 } 4458 4459 ext4_debug("freeing block %llu\n", block); 4460 + trace_ext4_free_blocks(inode, block, count, flags); 4461 + 4462 + if (flags & EXT4_FREE_BLOCKS_FORGET) { 4463 + struct buffer_head *tbh = bh; 4464 + int i; 4465 + 4466 + BUG_ON(bh && (count > 1)); 4467 
+ 4468 + for (i = 0; i < count; i++) { 4469 + if (!bh) 4470 + tbh = sb_find_get_block(inode->i_sb, 4471 + block + i); 4472 + ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 4473 + inode, tbh, block + i); 4474 + } 4475 + } 4476 + 4477 + /* 4478 + * We need to make sure we don't reuse the freed block until 4479 + * after the transaction is committed, which we can do by 4480 + * treating the block as metadata, below. We make an 4481 + * exception if the inode is to be written in writeback mode 4482 + * since writeback mode has weak data consistency guarantees. 4483 + */ 4484 + if (!ext4_should_writeback_data(inode)) 4485 + flags |= EXT4_FREE_BLOCKS_METADATA; 4486 4487 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4488 if (ac) { ··· 4533 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4534 if (err) 4535 goto error_return; 4536 + 4537 + if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) { 4538 struct ext4_free_data *new_entry; 4539 /* 4540 * blocks being freed are metadata. these blocks shouldn't ··· 4572 4573 ext4_mb_release_desc(&e4b); 4574 4575 + freed += count; 4576 4577 /* We dirtied the bitmap block */ 4578 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); ··· 4592 } 4593 sb->s_dirt = 1; 4594 error_return: 4595 + if (freed) 4596 + vfs_dq_free_block(inode, freed); 4597 brelse(bitmap_bh); 4598 ext4_std_error(sb, err); 4599 if (ac)
+18 -9
fs/ext4/migrate.c
··· 238 * So allocate a credit of 3. We may update 239 * quota (user and group). 240 */ 241 - needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); 242 243 if (ext4_journal_extend(handle, needed) != 0) 244 retval = ext4_journal_restart(handle, needed); ··· 262 for (i = 0; i < max_entries; i++) { 263 if (tmp_idata[i]) { 264 extend_credit_for_blkdel(handle, inode); 265 - ext4_free_blocks(handle, inode, 266 - le32_to_cpu(tmp_idata[i]), 1, 1); 267 } 268 } 269 put_bh(bh); 270 extend_credit_for_blkdel(handle, inode); 271 - ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); 272 return 0; 273 } 274 ··· 301 } 302 put_bh(bh); 303 extend_credit_for_blkdel(handle, inode); 304 - ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); 305 return 0; 306 } 307 ··· 314 /* ei->i_data[EXT4_IND_BLOCK] */ 315 if (i_data[0]) { 316 extend_credit_for_blkdel(handle, inode); 317 - ext4_free_blocks(handle, inode, 318 - le32_to_cpu(i_data[0]), 1, 1); 319 } 320 321 /* ei->i_data[EXT4_DIND_BLOCK] */ ··· 427 } 428 put_bh(bh); 429 extend_credit_for_blkdel(handle, inode); 430 - ext4_free_blocks(handle, inode, block, 1, 1); 431 return retval; 432 } 433 ··· 486 handle = ext4_journal_start(inode, 487 EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 488 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 489 - 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb) 490 + 1); 491 if (IS_ERR(handle)) { 492 retval = PTR_ERR(handle);
··· 238 * So allocate a credit of 3. We may update 239 * quota (user and group). 240 */ 241 + needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); 242 243 if (ext4_journal_extend(handle, needed) != 0) 244 retval = ext4_journal_restart(handle, needed); ··· 262 for (i = 0; i < max_entries; i++) { 263 if (tmp_idata[i]) { 264 extend_credit_for_blkdel(handle, inode); 265 + ext4_free_blocks(handle, inode, 0, 266 + le32_to_cpu(tmp_idata[i]), 1, 267 + EXT4_FREE_BLOCKS_METADATA | 268 + EXT4_FREE_BLOCKS_FORGET); 269 } 270 } 271 put_bh(bh); 272 extend_credit_for_blkdel(handle, inode); 273 + ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, 274 + EXT4_FREE_BLOCKS_METADATA | 275 + EXT4_FREE_BLOCKS_FORGET); 276 return 0; 277 } 278 ··· 297 } 298 put_bh(bh); 299 extend_credit_for_blkdel(handle, inode); 300 + ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, 301 + EXT4_FREE_BLOCKS_METADATA | 302 + EXT4_FREE_BLOCKS_FORGET); 303 return 0; 304 } 305 ··· 308 /* ei->i_data[EXT4_IND_BLOCK] */ 309 if (i_data[0]) { 310 extend_credit_for_blkdel(handle, inode); 311 + ext4_free_blocks(handle, inode, 0, 312 + le32_to_cpu(i_data[0]), 1, 313 + EXT4_FREE_BLOCKS_METADATA | 314 + EXT4_FREE_BLOCKS_FORGET); 315 } 316 317 /* ei->i_data[EXT4_DIND_BLOCK] */ ··· 419 } 420 put_bh(bh); 421 extend_credit_for_blkdel(handle, inode); 422 + ext4_free_blocks(handle, inode, 0, block, 1, 423 + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 424 return retval; 425 } 426 ··· 477 handle = ext4_journal_start(inode, 478 EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 479 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 480 + EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) 481 + 1); 482 if (IS_ERR(handle)) { 483 retval = PTR_ERR(handle);
+133 -149
fs/ext4/move_extent.c
··· 77 mext_next_extent(struct inode *inode, struct ext4_ext_path *path, 78 struct ext4_extent **extent) 79 { 80 int ppos, leaf_ppos = path->p_depth; 81 82 ppos = leaf_ppos; 83 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { 84 /* leaf block */ 85 *extent = ++path[ppos].p_ext; 86 return 0; 87 } 88 ··· 121 ext_block_hdr(path[cur_ppos+1].p_bh); 122 } 123 124 /* leaf block */ 125 path[leaf_ppos].p_ext = *extent = 126 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); 127 return 0; 128 } 129 } ··· 166 } 167 168 /** 169 - * mext_double_down_read - Acquire two inodes' read semaphore 170 * 171 * @orig_inode: original inode structure 172 * @donor_inode: donor inode structure 173 - * Acquire read semaphore of the two inodes (orig and donor) by i_ino order. 174 */ 175 static void 176 - mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode) 177 - { 178 - struct inode *first = orig_inode, *second = donor_inode; 179 - 180 - /* 181 - * Use the inode number to provide the stable locking order instead 182 - * of its address, because the C language doesn't guarantee you can 183 - * compare pointers that don't come from the same array. 184 - */ 185 - if (donor_inode->i_ino < orig_inode->i_ino) { 186 - first = donor_inode; 187 - second = orig_inode; 188 - } 189 - 190 - down_read(&EXT4_I(first)->i_data_sem); 191 - down_read(&EXT4_I(second)->i_data_sem); 192 - } 193 - 194 - /** 195 - * mext_double_down_write - Acquire two inodes' write semaphore 196 - * 197 - * @orig_inode: original inode structure 198 - * @donor_inode: donor inode structure 199 - * Acquire write semaphore of the two inodes (orig and donor) by i_ino order. 
200 - */ 201 - static void 202 - mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) 203 { 204 struct inode *first = orig_inode, *second = donor_inode; 205 ··· 189 } 190 191 down_write(&EXT4_I(first)->i_data_sem); 192 - down_write(&EXT4_I(second)->i_data_sem); 193 } 194 195 /** 196 - * mext_double_up_read - Release two inodes' read semaphore 197 * 198 * @orig_inode: original inode structure to be released its lock first 199 * @donor_inode: donor inode structure to be released its lock second 200 - * Release read semaphore of two inodes (orig and donor). 201 */ 202 static void 203 - mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) 204 - { 205 - up_read(&EXT4_I(orig_inode)->i_data_sem); 206 - up_read(&EXT4_I(donor_inode)->i_data_sem); 207 - } 208 - 209 - /** 210 - * mext_double_up_write - Release two inodes' write semaphore 211 - * 212 - * @orig_inode: original inode structure to be released its lock first 213 - * @donor_inode: donor inode structure to be released its lock second 214 - * Release write semaphore of two inodes (orig and donor). 215 - */ 216 - static void 217 - mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode) 218 { 219 up_write(&EXT4_I(orig_inode)->i_data_sem); 220 up_write(&EXT4_I(donor_inode)->i_data_sem); ··· 568 * @tmp_oext: the extent that will belong to the donor inode 569 * @orig_off: block offset of original inode 570 * @donor_off: block offset of donor inode 571 - * @max_count: the maximun length of extents 572 * 573 * Return 0 on success, or a negative error value on failure. 574 */ ··· 633 * @donor_inode: donor inode 634 * @from: block offset of orig_inode 635 * @count: block count to be replaced 636 * 637 * Replace original inode extents and donor inode extents page by page. 638 * We implement this replacement in the following three steps: ··· 644 * 3. Change the block information of donor inode to point at the saved 645 * original inode blocks in the dummy extents. 
646 * 647 - * Return 0 on success, or a negative error value on failure. 648 */ 649 static int 650 mext_replace_branches(handle_t *handle, struct inode *orig_inode, 651 struct inode *donor_inode, ext4_lblk_t from, 652 - ext4_lblk_t count) 653 { 654 struct ext4_ext_path *orig_path = NULL; 655 struct ext4_ext_path *donor_path = NULL; 656 struct ext4_extent *oext, *dext; 657 struct ext4_extent tmp_dext, tmp_oext; 658 ext4_lblk_t orig_off = from, donor_off = from; 659 - int err = 0; 660 int depth; 661 int replaced_count = 0; 662 int dext_alen; 663 664 - mext_double_down_write(orig_inode, donor_inode); 665 666 /* Get the original extent for the block "orig_off" */ 667 - err = get_ext_path(orig_inode, orig_off, &orig_path); 668 - if (err) 669 goto out; 670 671 /* Get the donor extent for the head */ 672 - err = get_ext_path(donor_inode, donor_off, &donor_path); 673 - if (err) 674 goto out; 675 depth = ext_depth(orig_inode); 676 oext = orig_path[depth].p_ext; ··· 680 dext = donor_path[depth].p_ext; 681 tmp_dext = *dext; 682 683 - err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, 684 donor_off, count); 685 - if (err) 686 goto out; 687 688 /* Loop for the donor extents */ ··· 691 if (!dext) { 692 ext4_error(donor_inode->i_sb, __func__, 693 "The extent for donor must be found"); 694 - err = -EIO; 695 goto out; 696 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { 697 ext4_error(donor_inode->i_sb, __func__, ··· 699 "extent(%u) should be equal", 700 donor_off, 701 le32_to_cpu(tmp_dext.ee_block)); 702 - err = -EIO; 703 goto out; 704 } 705 706 /* Set donor extent to orig extent */ 707 - err = mext_leaf_block(handle, orig_inode, 708 orig_path, &tmp_dext, &orig_off); 709 - if (err < 0) 710 goto out; 711 712 /* Set orig extent to donor extent */ 713 - err = mext_leaf_block(handle, donor_inode, 714 donor_path, &tmp_oext, &donor_off); 715 - if (err < 0) 716 goto out; 717 718 dext_alen = ext4_ext_get_actual_len(&tmp_dext); ··· 726 727 if (orig_path) 728 
ext4_ext_drop_refs(orig_path); 729 - err = get_ext_path(orig_inode, orig_off, &orig_path); 730 - if (err) 731 goto out; 732 depth = ext_depth(orig_inode); 733 oext = orig_path[depth].p_ext; 734 - if (le32_to_cpu(oext->ee_block) + 735 - ext4_ext_get_actual_len(oext) <= orig_off) { 736 - err = 0; 737 - goto out; 738 - } 739 tmp_oext = *oext; 740 741 if (donor_path) 742 ext4_ext_drop_refs(donor_path); 743 - err = get_ext_path(donor_inode, donor_off, &donor_path); 744 - if (err) 745 goto out; 746 depth = ext_depth(donor_inode); 747 dext = donor_path[depth].p_ext; 748 - if (le32_to_cpu(dext->ee_block) + 749 - ext4_ext_get_actual_len(dext) <= donor_off) { 750 - err = 0; 751 - goto out; 752 - } 753 tmp_dext = *dext; 754 755 - err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, 756 donor_off, count - replaced_count); 757 - if (err) 758 goto out; 759 } 760 ··· 758 kfree(donor_path); 759 } 760 761 - mext_double_up_write(orig_inode, donor_inode); 762 - return err; 763 } 764 765 /** ··· 775 * @data_offset_in_page: block index where data swapping starts 776 * @block_len_in_page: the number of blocks to be swapped 777 * @uninit: orig extent is uninitialized or not 778 * 779 * Save the data in original inode blocks and replace original inode extents 780 * with donor inode extents by calling mext_replace_branches(). 781 - * Finally, write out the saved data in new original inode blocks. Return 0 782 - * on success, or a negative error value on failure. 
783 */ 784 static int 785 move_extent_per_page(struct file *o_filp, struct inode *donor_inode, 786 pgoff_t orig_page_offset, int data_offset_in_page, 787 - int block_len_in_page, int uninit) 788 { 789 struct inode *orig_inode = o_filp->f_dentry->d_inode; 790 struct address_space *mapping = orig_inode->i_mapping; ··· 797 long long offs = orig_page_offset << PAGE_CACHE_SHIFT; 798 unsigned long blocksize = orig_inode->i_sb->s_blocksize; 799 unsigned int w_flags = 0; 800 - unsigned int tmp_data_len, data_len; 801 void *fsdata; 802 - int ret, i, jblocks; 803 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; 804 805 /* ··· 811 jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; 812 handle = ext4_journal_start(orig_inode, jblocks); 813 if (IS_ERR(handle)) { 814 - ret = PTR_ERR(handle); 815 - return ret; 816 } 817 818 if (segment_eq(get_fs(), KERNEL_DS)) ··· 828 * Just swap data blocks between orig and donor. 829 */ 830 if (uninit) { 831 - ret = mext_replace_branches(handle, orig_inode, 832 - donor_inode, orig_blk_offset, 833 - block_len_in_page); 834 - 835 - /* Clear the inode cache not to refer to the old data */ 836 - ext4_ext_invalidate_cache(orig_inode); 837 - ext4_ext_invalidate_cache(donor_inode); 838 goto out2; 839 } 840 841 offs = (long long)orig_blk_offset << orig_inode->i_blkbits; 842 843 - /* Calculate data_len */ 844 if ((orig_blk_offset + block_len_in_page - 1) == 845 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { 846 /* Replace the last block */ 847 - tmp_data_len = orig_inode->i_size & (blocksize - 1); 848 /* 849 - * If data_len equal zero, it shows data_len is multiples of 850 * blocksize. So we set appropriate value. 
851 */ 852 - if (tmp_data_len == 0) 853 - tmp_data_len = blocksize; 854 855 - data_len = tmp_data_len + 856 ((block_len_in_page - 1) << orig_inode->i_blkbits); 857 - } else { 858 - data_len = block_len_in_page << orig_inode->i_blkbits; 859 - } 860 861 - ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags, 862 &page, &fsdata); 863 - if (unlikely(ret < 0)) 864 goto out; 865 866 if (!PageUptodate(page)) { ··· 878 /* Release old bh and drop refs */ 879 try_to_release_page(page, 0); 880 881 - ret = mext_replace_branches(handle, orig_inode, donor_inode, 882 - orig_blk_offset, block_len_in_page); 883 - if (ret < 0) 884 - goto out; 885 - 886 - /* Clear the inode cache not to refer to the old data */ 887 - ext4_ext_invalidate_cache(orig_inode); 888 - ext4_ext_invalidate_cache(donor_inode); 889 890 if (!page_has_buffers(page)) 891 create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); ··· 898 bh = bh->b_this_page; 899 900 for (i = 0; i < block_len_in_page; i++) { 901 - ret = ext4_get_block(orig_inode, 902 (sector_t)(orig_blk_offset + i), bh, 0); 903 - if (ret < 0) 904 goto out; 905 906 if (bh->b_this_page != NULL) 907 bh = bh->b_this_page; 908 } 909 910 - ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len, 911 page, fsdata); 912 page = NULL; 913 ··· 921 out2: 922 ext4_journal_stop(handle); 923 924 - return ret < 0 ? ret : 0; 925 } 926 927 /** ··· 935 * @orig_start: logical start offset in block for orig 936 * @donor_start: logical start offset in block for donor 937 * @len: the number of blocks to be moved 938 - * @moved_len: moved block length 939 * 940 * Check the arguments of ext4_move_extents() whether the files can be 941 * exchanged with each other. 
··· 942 */ 943 static int 944 mext_check_arguments(struct inode *orig_inode, 945 - struct inode *donor_inode, __u64 orig_start, 946 - __u64 donor_start, __u64 *len, __u64 moved_len) 947 { 948 ext4_lblk_t orig_blocks, donor_blocks; 949 unsigned int blkbits = orig_inode->i_blkbits; ··· 954 ext4_debug("ext4 move extent: The argument files should be " 955 "regular file [ino:orig %lu, donor %lu]\n", 956 orig_inode->i_ino, donor_inode->i_ino); 957 return -EINVAL; 958 } 959 ··· 1001 ext4_debug("ext4 move extent: orig and donor's start " 1002 "offset are not same [ino:orig %lu, donor %lu]\n", 1003 orig_inode->i_ino, donor_inode->i_ino); 1004 - return -EINVAL; 1005 - } 1006 - 1007 - if (moved_len) { 1008 - ext4_debug("ext4 move extent: moved_len should be 0 " 1009 - "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, 1010 - donor_inode->i_ino); 1011 return -EINVAL; 1012 } 1013 ··· 1060 } 1061 1062 if (!*len) { 1063 - ext4_debug("ext4 move extent: len shoudld not be 0 " 1064 "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, 1065 donor_inode->i_ino); 1066 return -EINVAL; ··· 1204 return -EINVAL; 1205 } 1206 1207 - /* protect orig and donor against a truncate */ 1208 ret1 = mext_inode_double_lock(orig_inode, donor_inode); 1209 if (ret1 < 0) 1210 return ret1; 1211 1212 - mext_double_down_read(orig_inode, donor_inode); 1213 /* Check the filesystem environment whether move_extent can be done */ 1214 ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, 1215 - donor_start, &len, *moved_len); 1216 - mext_double_up_read(orig_inode, donor_inode); 1217 if (ret1) 1218 goto out; 1219 ··· 1327 seq_start = le32_to_cpu(ext_cur->ee_block); 1328 rest_blocks = seq_blocks; 1329 1330 - /* Discard preallocations of two inodes */ 1331 - down_write(&EXT4_I(orig_inode)->i_data_sem); 1332 - ext4_discard_preallocations(orig_inode); 1333 - up_write(&EXT4_I(orig_inode)->i_data_sem); 1334 - 1335 - down_write(&EXT4_I(donor_inode)->i_data_sem); 1336 - ext4_discard_preallocations(donor_inode); 
1337 - up_write(&EXT4_I(donor_inode)->i_data_sem); 1338 1339 while (orig_page_offset <= seq_end_page) { 1340 1341 /* Swap original branches with new branches */ 1342 - ret1 = move_extent_per_page(o_filp, donor_inode, 1343 orig_page_offset, 1344 data_offset_in_page, 1345 - block_len_in_page, uninit); 1346 - if (ret1 < 0) 1347 - goto out; 1348 - orig_page_offset++; 1349 /* Count how many blocks we have exchanged */ 1350 *moved_len += block_len_in_page; 1351 if (*moved_len > len) { 1352 ext4_error(orig_inode->i_sb, __func__, 1353 "We replaced blocks too much! " 1354 "sum of replaced: %llu requested: %llu", 1355 *moved_len, len); 1356 ret1 = -EIO; 1357 - goto out; 1358 } 1359 1360 data_offset_in_page = 0; 1361 rest_blocks -= block_len_in_page; 1362 if (rest_blocks > blocks_per_page) ··· 1367 else 1368 block_len_in_page = rest_blocks; 1369 } 1370 1371 /* Decrease buffer counter */ 1372 if (holecheck_path) ··· 1393 1394 } 1395 out: 1396 if (orig_path) { 1397 ext4_ext_drop_refs(orig_path); 1398 kfree(orig_path); ··· 1406 ext4_ext_drop_refs(holecheck_path); 1407 kfree(holecheck_path); 1408 } 1409 - 1410 ret2 = mext_inode_double_unlock(orig_inode, donor_inode); 1411 1412 if (ret1)
··· 77 mext_next_extent(struct inode *inode, struct ext4_ext_path *path, 78 struct ext4_extent **extent) 79 { 80 + struct ext4_extent_header *eh; 81 int ppos, leaf_ppos = path->p_depth; 82 83 ppos = leaf_ppos; 84 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { 85 /* leaf block */ 86 *extent = ++path[ppos].p_ext; 87 + path[ppos].p_block = ext_pblock(path[ppos].p_ext); 88 return 0; 89 } 90 ··· 119 ext_block_hdr(path[cur_ppos+1].p_bh); 120 } 121 122 + path[leaf_ppos].p_ext = *extent = NULL; 123 + 124 + eh = path[leaf_ppos].p_hdr; 125 + if (le16_to_cpu(eh->eh_entries) == 0) 126 + /* empty leaf is found */ 127 + return -ENODATA; 128 + 129 /* leaf block */ 130 path[leaf_ppos].p_ext = *extent = 131 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); 132 + path[leaf_ppos].p_block = 133 + ext_pblock(path[leaf_ppos].p_ext); 134 return 0; 135 } 136 } ··· 155 } 156 157 /** 158 + * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem 159 * 160 * @orig_inode: original inode structure 161 * @donor_inode: donor inode structure 162 + * Acquire write lock of i_data_sem of the two inodes (orig and donor) by 163 + * i_ino order. 164 */ 165 static void 166 + double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) 167 { 168 struct inode *first = orig_inode, *second = donor_inode; 169 ··· 203 } 204 205 down_write(&EXT4_I(first)->i_data_sem); 206 + down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); 207 } 208 209 /** 210 + * double_up_write_data_sem - Release two inodes' write lock of i_data_sem 211 * 212 * @orig_inode: original inode structure to be released its lock first 213 * @donor_inode: donor inode structure to be released its lock second 214 + * Release write lock of i_data_sem of two inodes (orig and donor). 
215 */ 216 static void 217 + double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) 218 { 219 up_write(&EXT4_I(orig_inode)->i_data_sem); 220 up_write(&EXT4_I(donor_inode)->i_data_sem); ··· 596 * @tmp_oext: the extent that will belong to the donor inode 597 * @orig_off: block offset of original inode 598 * @donor_off: block offset of donor inode 599 + * @max_count: the maximum length of extents 600 * 601 * Return 0 on success, or a negative error value on failure. 602 */ ··· 661 * @donor_inode: donor inode 662 * @from: block offset of orig_inode 663 * @count: block count to be replaced 664 + * @err: pointer to save return value 665 * 666 * Replace original inode extents and donor inode extents page by page. 667 * We implement this replacement in the following three steps: ··· 671 * 3. Change the block information of donor inode to point at the saved 672 * original inode blocks in the dummy extents. 673 * 674 + * Return replaced block count. 675 */ 676 static int 677 mext_replace_branches(handle_t *handle, struct inode *orig_inode, 678 struct inode *donor_inode, ext4_lblk_t from, 679 + ext4_lblk_t count, int *err) 680 { 681 struct ext4_ext_path *orig_path = NULL; 682 struct ext4_ext_path *donor_path = NULL; 683 struct ext4_extent *oext, *dext; 684 struct ext4_extent tmp_dext, tmp_oext; 685 ext4_lblk_t orig_off = from, donor_off = from; 686 int depth; 687 int replaced_count = 0; 688 int dext_alen; 689 690 + /* Protect extent trees against block allocations via delalloc */ 691 + double_down_write_data_sem(orig_inode, donor_inode); 692 693 /* Get the original extent for the block "orig_off" */ 694 + *err = get_ext_path(orig_inode, orig_off, &orig_path); 695 + if (*err) 696 goto out; 697 698 /* Get the donor extent for the head */ 699 + *err = get_ext_path(donor_inode, donor_off, &donor_path); 700 + if (*err) 701 goto out; 702 depth = ext_depth(orig_inode); 703 oext = orig_path[depth].p_ext; ··· 707 dext = donor_path[depth].p_ext; 708 tmp_dext = 
*dext; 709 710 + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, 711 donor_off, count); 712 + if (*err) 713 goto out; 714 715 /* Loop for the donor extents */ ··· 718 if (!dext) { 719 ext4_error(donor_inode->i_sb, __func__, 720 "The extent for donor must be found"); 721 + *err = -EIO; 722 goto out; 723 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { 724 ext4_error(donor_inode->i_sb, __func__, ··· 726 "extent(%u) should be equal", 727 donor_off, 728 le32_to_cpu(tmp_dext.ee_block)); 729 + *err = -EIO; 730 goto out; 731 } 732 733 /* Set donor extent to orig extent */ 734 + *err = mext_leaf_block(handle, orig_inode, 735 orig_path, &tmp_dext, &orig_off); 736 + if (*err) 737 goto out; 738 739 /* Set orig extent to donor extent */ 740 + *err = mext_leaf_block(handle, donor_inode, 741 donor_path, &tmp_oext, &donor_off); 742 + if (*err) 743 goto out; 744 745 dext_alen = ext4_ext_get_actual_len(&tmp_dext); ··· 753 754 if (orig_path) 755 ext4_ext_drop_refs(orig_path); 756 + *err = get_ext_path(orig_inode, orig_off, &orig_path); 757 + if (*err) 758 goto out; 759 depth = ext_depth(orig_inode); 760 oext = orig_path[depth].p_ext; 761 tmp_oext = *oext; 762 763 if (donor_path) 764 ext4_ext_drop_refs(donor_path); 765 + *err = get_ext_path(donor_inode, donor_off, &donor_path); 766 + if (*err) 767 goto out; 768 depth = ext_depth(donor_inode); 769 dext = donor_path[depth].p_ext; 770 tmp_dext = *dext; 771 772 + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, 773 donor_off, count - replaced_count); 774 + if (*err) 775 goto out; 776 } 777 ··· 795 kfree(donor_path); 796 } 797 798 + ext4_ext_invalidate_cache(orig_inode); 799 + ext4_ext_invalidate_cache(donor_inode); 800 + 801 + double_up_write_data_sem(orig_inode, donor_inode); 802 + 803 + return replaced_count; 804 } 805 806 /** ··· 808 * @data_offset_in_page: block index where data swapping starts 809 * @block_len_in_page: the number of blocks to be swapped 810 * @uninit: orig extent is uninitialized 
or not 811 + * @err: pointer to save return value 812 * 813 * Save the data in original inode blocks and replace original inode extents 814 * with donor inode extents by calling mext_replace_branches(). 815 + * Finally, write out the saved data in new original inode blocks. Return 816 + * replaced block count. 817 */ 818 static int 819 move_extent_per_page(struct file *o_filp, struct inode *donor_inode, 820 pgoff_t orig_page_offset, int data_offset_in_page, 821 + int block_len_in_page, int uninit, int *err) 822 { 823 struct inode *orig_inode = o_filp->f_dentry->d_inode; 824 struct address_space *mapping = orig_inode->i_mapping; ··· 829 long long offs = orig_page_offset << PAGE_CACHE_SHIFT; 830 unsigned long blocksize = orig_inode->i_sb->s_blocksize; 831 unsigned int w_flags = 0; 832 + unsigned int tmp_data_size, data_size, replaced_size; 833 void *fsdata; 834 + int i, jblocks; 835 + int err2 = 0; 836 + int replaced_count = 0; 837 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; 838 839 /* ··· 841 jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; 842 handle = ext4_journal_start(orig_inode, jblocks); 843 if (IS_ERR(handle)) { 844 + *err = PTR_ERR(handle); 845 + return 0; 846 } 847 848 if (segment_eq(get_fs(), KERNEL_DS)) ··· 858 * Just swap data blocks between orig and donor. 859 */ 860 if (uninit) { 861 + replaced_count = mext_replace_branches(handle, orig_inode, 862 + donor_inode, orig_blk_offset, 863 + block_len_in_page, err); 864 goto out2; 865 } 866 867 offs = (long long)orig_blk_offset << orig_inode->i_blkbits; 868 869 + /* Calculate data_size */ 870 if ((orig_blk_offset + block_len_in_page - 1) == 871 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { 872 /* Replace the last block */ 873 + tmp_data_size = orig_inode->i_size & (blocksize - 1); 874 /* 875 + * If data_size equal zero, it shows data_size is multiples of 876 * blocksize. So we set appropriate value. 
877 */ 878 + if (tmp_data_size == 0) 879 + tmp_data_size = blocksize; 880 881 + data_size = tmp_data_size + 882 ((block_len_in_page - 1) << orig_inode->i_blkbits); 883 + } else 884 + data_size = block_len_in_page << orig_inode->i_blkbits; 885 886 + replaced_size = data_size; 887 + 888 + *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags, 889 &page, &fsdata); 890 + if (unlikely(*err < 0)) 891 goto out; 892 893 if (!PageUptodate(page)) { ··· 911 /* Release old bh and drop refs */ 912 try_to_release_page(page, 0); 913 914 + replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, 915 + orig_blk_offset, block_len_in_page, 916 + &err2); 917 + if (err2) { 918 + if (replaced_count) { 919 + block_len_in_page = replaced_count; 920 + replaced_size = 921 + block_len_in_page << orig_inode->i_blkbits; 922 + } else 923 + goto out; 924 + } 925 926 if (!page_has_buffers(page)) 927 create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); ··· 928 bh = bh->b_this_page; 929 930 for (i = 0; i < block_len_in_page; i++) { 931 + *err = ext4_get_block(orig_inode, 932 (sector_t)(orig_blk_offset + i), bh, 0); 933 + if (*err < 0) 934 goto out; 935 936 if (bh->b_this_page != NULL) 937 bh = bh->b_this_page; 938 } 939 940 + *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size, 941 page, fsdata); 942 page = NULL; 943 ··· 951 out2: 952 ext4_journal_stop(handle); 953 954 + if (err2) 955 + *err = err2; 956 + 957 + return replaced_count; 958 } 959 960 /** ··· 962 * @orig_start: logical start offset in block for orig 963 * @donor_start: logical start offset in block for donor 964 * @len: the number of blocks to be moved 965 * 966 * Check the arguments of ext4_move_extents() whether the files can be 967 * exchanged with each other. 
··· 970 */ 971 static int 972 mext_check_arguments(struct inode *orig_inode, 973 + struct inode *donor_inode, __u64 orig_start, 974 + __u64 donor_start, __u64 *len) 975 { 976 ext4_lblk_t orig_blocks, donor_blocks; 977 unsigned int blkbits = orig_inode->i_blkbits; ··· 982 ext4_debug("ext4 move extent: The argument files should be " 983 "regular file [ino:orig %lu, donor %lu]\n", 984 orig_inode->i_ino, donor_inode->i_ino); 985 + return -EINVAL; 986 + } 987 + 988 + if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { 989 + ext4_debug("ext4 move extent: suid or sgid is set" 990 + " to donor file [ino:orig %lu, donor %lu]\n", 991 + orig_inode->i_ino, donor_inode->i_ino); 992 return -EINVAL; 993 } 994 ··· 1022 ext4_debug("ext4 move extent: orig and donor's start " 1023 "offset are not same [ino:orig %lu, donor %lu]\n", 1024 orig_inode->i_ino, donor_inode->i_ino); 1025 return -EINVAL; 1026 } 1027 ··· 1088 } 1089 1090 if (!*len) { 1091 + ext4_debug("ext4 move extent: len should not be 0 " 1092 "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, 1093 donor_inode->i_ino); 1094 return -EINVAL; ··· 1232 return -EINVAL; 1233 } 1234 1235 + /* Protect orig and donor inodes against a truncate */ 1236 ret1 = mext_inode_double_lock(orig_inode, donor_inode); 1237 if (ret1 < 0) 1238 return ret1; 1239 1240 + /* Protect extent tree against block allocations via delalloc */ 1241 + double_down_write_data_sem(orig_inode, donor_inode); 1242 /* Check the filesystem environment whether move_extent can be done */ 1243 ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, 1244 + donor_start, &len); 1245 if (ret1) 1246 goto out; 1247 ··· 1355 seq_start = le32_to_cpu(ext_cur->ee_block); 1356 rest_blocks = seq_blocks; 1357 1358 + /* 1359 + * Up semaphore to avoid following problems: 1360 + * a. transaction deadlock among ext4_journal_start, 1361 + * ->write_begin via pagefault, and jbd2_journal_commit 1362 + * b. 
racing with ->readpage, ->write_begin, and ext4_get_block 1363 + * in move_extent_per_page 1364 + */ 1365 + double_up_write_data_sem(orig_inode, donor_inode); 1366 1367 while (orig_page_offset <= seq_end_page) { 1368 1369 /* Swap original branches with new branches */ 1370 + block_len_in_page = move_extent_per_page( 1371 + o_filp, donor_inode, 1372 orig_page_offset, 1373 data_offset_in_page, 1374 + block_len_in_page, uninit, 1375 + &ret1); 1376 + 1377 /* Count how many blocks we have exchanged */ 1378 *moved_len += block_len_in_page; 1379 + if (ret1 < 0) 1380 + break; 1381 if (*moved_len > len) { 1382 ext4_error(orig_inode->i_sb, __func__, 1383 "We replaced blocks too much! " 1384 "sum of replaced: %llu requested: %llu", 1385 *moved_len, len); 1386 ret1 = -EIO; 1387 + break; 1388 } 1389 1390 + orig_page_offset++; 1391 data_offset_in_page = 0; 1392 rest_blocks -= block_len_in_page; 1393 if (rest_blocks > blocks_per_page) ··· 1392 else 1393 block_len_in_page = rest_blocks; 1394 } 1395 + 1396 + double_down_write_data_sem(orig_inode, donor_inode); 1397 + if (ret1 < 0) 1398 + break; 1399 1400 /* Decrease buffer counter */ 1401 if (holecheck_path) ··· 1414 1415 } 1416 out: 1417 + if (*moved_len) { 1418 + ext4_discard_preallocations(orig_inode); 1419 + ext4_discard_preallocations(donor_inode); 1420 + } 1421 + 1422 if (orig_path) { 1423 ext4_ext_drop_refs(orig_path); 1424 kfree(orig_path); ··· 1422 ext4_ext_drop_refs(holecheck_path); 1423 kfree(holecheck_path); 1424 } 1425 + double_up_write_data_sem(orig_inode, donor_inode); 1426 ret2 = mext_inode_double_unlock(orig_inode, donor_inode); 1427 1428 if (ret1)
+16 -22
fs/ext4/namei.c
··· 1292 * add_dirent_to_buf will attempt search the directory block for 1293 * space. It will return -ENOSPC if no space is available, and -EIO 1294 * and -EEXIST if directory entry already exists. 1295 - * 1296 - * NOTE! bh is NOT released in the case where ENOSPC is returned. In 1297 - * all other cases bh is released. 1298 */ 1299 static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, 1300 struct inode *inode, struct ext4_dir_entry_2 *de, ··· 1312 top = bh->b_data + blocksize - reclen; 1313 while ((char *) de <= top) { 1314 if (!ext4_check_dir_entry("ext4_add_entry", dir, de, 1315 - bh, offset)) { 1316 - brelse(bh); 1317 return -EIO; 1318 - } 1319 - if (ext4_match(namelen, name, de)) { 1320 - brelse(bh); 1321 return -EEXIST; 1322 - } 1323 nlen = EXT4_DIR_REC_LEN(de->name_len); 1324 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); 1325 if ((de->inode? rlen - nlen: rlen) >= reclen) ··· 1330 err = ext4_journal_get_write_access(handle, bh); 1331 if (err) { 1332 ext4_std_error(dir->i_sb, err); 1333 - brelse(bh); 1334 return err; 1335 } 1336 ··· 1369 err = ext4_handle_dirty_metadata(handle, dir, bh); 1370 if (err) 1371 ext4_std_error(dir->i_sb, err); 1372 - brelse(bh); 1373 return 0; 1374 } 1375 ··· 1462 if (!(de)) 1463 return retval; 1464 1465 - return add_dirent_to_buf(handle, dentry, inode, de, bh); 1466 } 1467 1468 /* ··· 1507 if(!bh) 1508 return retval; 1509 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); 1510 - if (retval != -ENOSPC) 1511 return retval; 1512 1513 if (blocks == 1 && !dx_fallback && 1514 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) ··· 1523 de = (struct ext4_dir_entry_2 *) bh->b_data; 1524 de->inode = 0; 1525 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); 1526 - return add_dirent_to_buf(handle, dentry, inode, de, bh); 1527 } 1528 1529 /* ··· 1558 goto journal_error; 1559 1560 err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); 1561 - if (err != -ENOSPC) { 1562 - bh = NULL; 1563 
goto cleanup; 1564 - } 1565 1566 /* Block full, should compress but for now just split */ 1567 dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", ··· 1652 if (!de) 1653 goto cleanup; 1654 err = add_dirent_to_buf(handle, dentry, inode, de, bh); 1655 - bh = NULL; 1656 goto cleanup; 1657 1658 journal_error: ··· 1769 retry: 1770 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 1771 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1772 - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); 1773 if (IS_ERR(handle)) 1774 return PTR_ERR(handle); 1775 ··· 1803 retry: 1804 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 1805 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1806 - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); 1807 if (IS_ERR(handle)) 1808 return PTR_ERR(handle); 1809 ··· 1840 retry: 1841 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 1842 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1843 - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); 1844 if (IS_ERR(handle)) 1845 return PTR_ERR(handle); 1846 ··· 2253 retry: 2254 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2255 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + 2256 - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); 2257 if (IS_ERR(handle)) 2258 return PTR_ERR(handle); 2259
··· 1292 * add_dirent_to_buf will attempt search the directory block for 1293 * space. It will return -ENOSPC if no space is available, and -EIO 1294 * and -EEXIST if directory entry already exists. 1295 */ 1296 static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, 1297 struct inode *inode, struct ext4_dir_entry_2 *de, ··· 1315 top = bh->b_data + blocksize - reclen; 1316 while ((char *) de <= top) { 1317 if (!ext4_check_dir_entry("ext4_add_entry", dir, de, 1318 + bh, offset)) 1319 return -EIO; 1320 + if (ext4_match(namelen, name, de)) 1321 return -EEXIST; 1322 nlen = EXT4_DIR_REC_LEN(de->name_len); 1323 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); 1324 if ((de->inode? rlen - nlen: rlen) >= reclen) ··· 1337 err = ext4_journal_get_write_access(handle, bh); 1338 if (err) { 1339 ext4_std_error(dir->i_sb, err); 1340 return err; 1341 } 1342 ··· 1377 err = ext4_handle_dirty_metadata(handle, dir, bh); 1378 if (err) 1379 ext4_std_error(dir->i_sb, err); 1380 return 0; 1381 } 1382 ··· 1471 if (!(de)) 1472 return retval; 1473 1474 + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 1475 + brelse(bh); 1476 + return retval; 1477 } 1478 1479 /* ··· 1514 if(!bh) 1515 return retval; 1516 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); 1517 + if (retval != -ENOSPC) { 1518 + brelse(bh); 1519 return retval; 1520 + } 1521 1522 if (blocks == 1 && !dx_fallback && 1523 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) ··· 1528 de = (struct ext4_dir_entry_2 *) bh->b_data; 1529 de->inode = 0; 1530 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); 1531 + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 1532 + brelse(bh); 1533 + return retval; 1534 } 1535 1536 /* ··· 1561 goto journal_error; 1562 1563 err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); 1564 + if (err != -ENOSPC) 1565 goto cleanup; 1566 1567 /* Block full, should compress but for now just split */ 1568 dxtrace(printk(KERN_DEBUG "using %u of %u 
node entries\n", ··· 1657 if (!de) 1658 goto cleanup; 1659 err = add_dirent_to_buf(handle, dentry, inode, de, bh); 1660 goto cleanup; 1661 1662 journal_error: ··· 1775 retry: 1776 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 1777 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1778 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); 1779 if (IS_ERR(handle)) 1780 return PTR_ERR(handle); 1781 ··· 1809 retry: 1810 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 1811 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1812 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); 1813 if (IS_ERR(handle)) 1814 return PTR_ERR(handle); 1815 ··· 1846 retry: 1847 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 1848 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1849 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); 1850 if (IS_ERR(handle)) 1851 return PTR_ERR(handle); 1852 ··· 2259 retry: 2260 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2261 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + 2262 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); 2263 if (IS_ERR(handle)) 2264 return PTR_ERR(handle); 2265
+1 -1
fs/ext4/resize.c
··· 247 goto exit_bh; 248 249 if (IS_ERR(gdb = bclean(handle, sb, block))) { 250 - err = PTR_ERR(bh); 251 goto exit_bh; 252 } 253 ext4_handle_dirty_metadata(handle, NULL, gdb);
··· 247 goto exit_bh; 248 249 if (IS_ERR(gdb = bclean(handle, sb, block))) { 250 + err = PTR_ERR(gdb); 251 goto exit_bh; 252 } 253 ext4_handle_dirty_metadata(handle, NULL, gdb);
+87 -31
fs/ext4/super.c
··· 603 if (sb->s_dirt) 604 ext4_commit_super(sb, 1); 605 606 - ext4_release_system_zone(sb); 607 - ext4_mb_release(sb); 608 - ext4_ext_release(sb); 609 - ext4_xattr_put_super(sb); 610 if (sbi->s_journal) { 611 err = jbd2_journal_destroy(sbi->s_journal); 612 sbi->s_journal = NULL; ··· 610 ext4_abort(sb, __func__, 611 "Couldn't clean up the journal"); 612 } 613 if (!(sb->s_flags & MS_RDONLY)) { 614 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 615 es->s_state = cpu_to_le16(sbi->s_mount_state); ··· 706 spin_lock_init(&(ei->i_block_reservation_lock)); 707 INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); 708 ei->cur_aio_dio = NULL; 709 710 return &ei->vfs_inode; 711 } ··· 903 if (test_opt(sb, NO_AUTO_DA_ALLOC)) 904 seq_puts(seq, ",noauto_da_alloc"); 905 906 ext4_show_quota_options(seq, sb); 907 908 return 0; ··· 1089 Opt_usrquota, Opt_grpquota, Opt_i_version, 1090 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1091 Opt_block_validity, Opt_noblock_validity, 1092 - Opt_inode_readahead_blks, Opt_journal_ioprio 1093 }; 1094 1095 static const match_table_t tokens = { ··· 1115 {Opt_acl, "acl"}, 1116 {Opt_noacl, "noacl"}, 1117 {Opt_noload, "noload"}, 1118 {Opt_nobh, "nobh"}, 1119 {Opt_bh, "bh"}, 1120 {Opt_commit, "commit=%u"}, ··· 1156 {Opt_auto_da_alloc, "auto_da_alloc=%u"}, 1157 {Opt_auto_da_alloc, "auto_da_alloc"}, 1158 {Opt_noauto_da_alloc, "noauto_da_alloc"}, 1159 {Opt_err, NULL}, 1160 }; 1161 ··· 1579 else 1580 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); 1581 break; 1582 default: 1583 ext4_msg(sb, KERN_ERR, 1584 "Unrecognized mount option \"%s\" " ··· 1693 size_t size; 1694 int i; 1695 1696 - if (!sbi->s_es->s_log_groups_per_flex) { 1697 sbi->s_log_groups_per_flex = 0; 1698 return 1; 1699 } 1700 - 1701 - sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; 1702 - groups_per_flex = 1 << sbi->s_log_groups_per_flex; 1703 1704 /* We allocate both existing and potentially added groups */ 1705 flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) 
+ ··· 2741 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { 2742 if (ext4_load_journal(sb, es, journal_devnum)) 2743 goto failed_mount3; 2744 - if (!(sb->s_flags & MS_RDONLY) && 2745 - EXT4_SB(sb)->s_journal->j_failed_commit) { 2746 - ext4_msg(sb, KERN_CRIT, "error: " 2747 - "ext4_fill_super: Journal transaction " 2748 - "%u is corrupt", 2749 - EXT4_SB(sb)->s_journal->j_failed_commit); 2750 - if (test_opt(sb, ERRORS_RO)) { 2751 - ext4_msg(sb, KERN_CRIT, 2752 - "Mounting filesystem read-only"); 2753 - sb->s_flags |= MS_RDONLY; 2754 - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 2755 - es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 2756 - } 2757 - if (test_opt(sb, ERRORS_PANIC)) { 2758 - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 2759 - es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 2760 - ext4_commit_super(sb, 1); 2761 - goto failed_mount4; 2762 - } 2763 - } 2764 } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && 2765 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { 2766 ext4_msg(sb, KERN_ERR, "required journal recovery " ··· 3668 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; 3669 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - 3670 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); 3671 - ext4_free_blocks_count_set(es, buf->f_bfree); 3672 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 3673 if (buf->f_bfree < ext4_r_blocks_count(es)) 3674 buf->f_bavail = 0; 3675 buf->f_files = le32_to_cpu(es->s_inodes_count); 3676 buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); 3677 - es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); 3678 buf->f_namelen = EXT4_NAME_LEN; 3679 fsid = le64_to_cpup((void *)es->s_uuid) ^ 3680 le64_to_cpup((void *)es->s_uuid + sizeof(u64)); ··· 3964 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); 3965 } 3966 3967 static struct file_system_type ext4_fs_type = { 3968 .owner = THIS_MODULE, 3969 .name = "ext4", ··· 
4045 err = init_inodecache(); 4046 if (err) 4047 goto out1; 4048 err = register_filesystem(&ext4_fs_type); 4049 if (err) 4050 goto out; 4051 return 0; 4052 out: 4053 destroy_inodecache(); 4054 out1: 4055 exit_ext4_xattr(); ··· 4069 4070 static void __exit exit_ext4_fs(void) 4071 { 4072 unregister_filesystem(&ext4_fs_type); 4073 destroy_inodecache(); 4074 exit_ext4_xattr();
··· 603 if (sb->s_dirt) 604 ext4_commit_super(sb, 1); 605 606 if (sbi->s_journal) { 607 err = jbd2_journal_destroy(sbi->s_journal); 608 sbi->s_journal = NULL; ··· 614 ext4_abort(sb, __func__, 615 "Couldn't clean up the journal"); 616 } 617 + 618 + ext4_release_system_zone(sb); 619 + ext4_mb_release(sb); 620 + ext4_ext_release(sb); 621 + ext4_xattr_put_super(sb); 622 + 623 if (!(sb->s_flags & MS_RDONLY)) { 624 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 625 es->s_state = cpu_to_le16(sbi->s_mount_state); ··· 704 spin_lock_init(&(ei->i_block_reservation_lock)); 705 INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); 706 ei->cur_aio_dio = NULL; 707 + ei->i_sync_tid = 0; 708 + ei->i_datasync_tid = 0; 709 710 return &ei->vfs_inode; 711 } ··· 899 if (test_opt(sb, NO_AUTO_DA_ALLOC)) 900 seq_puts(seq, ",noauto_da_alloc"); 901 902 + if (test_opt(sb, DISCARD)) 903 + seq_puts(seq, ",discard"); 904 + 905 + if (test_opt(sb, NOLOAD)) 906 + seq_puts(seq, ",norecovery"); 907 + 908 ext4_show_quota_options(seq, sb); 909 910 return 0; ··· 1079 Opt_usrquota, Opt_grpquota, Opt_i_version, 1080 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1081 Opt_block_validity, Opt_noblock_validity, 1082 + Opt_inode_readahead_blks, Opt_journal_ioprio, 1083 + Opt_discard, Opt_nodiscard, 1084 }; 1085 1086 static const match_table_t tokens = { ··· 1104 {Opt_acl, "acl"}, 1105 {Opt_noacl, "noacl"}, 1106 {Opt_noload, "noload"}, 1107 + {Opt_noload, "norecovery"}, 1108 {Opt_nobh, "nobh"}, 1109 {Opt_bh, "bh"}, 1110 {Opt_commit, "commit=%u"}, ··· 1144 {Opt_auto_da_alloc, "auto_da_alloc=%u"}, 1145 {Opt_auto_da_alloc, "auto_da_alloc"}, 1146 {Opt_noauto_da_alloc, "noauto_da_alloc"}, 1147 + {Opt_discard, "discard"}, 1148 + {Opt_nodiscard, "nodiscard"}, 1149 {Opt_err, NULL}, 1150 }; 1151 ··· 1565 else 1566 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); 1567 break; 1568 + case Opt_discard: 1569 + set_opt(sbi->s_mount_opt, DISCARD); 1570 + break; 1571 + case Opt_nodiscard: 1572 + clear_opt(sbi->s_mount_opt, 
DISCARD); 1573 + break; 1574 default: 1575 ext4_msg(sb, KERN_ERR, 1576 "Unrecognized mount option \"%s\" " ··· 1673 size_t size; 1674 int i; 1675 1676 + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; 1677 + groups_per_flex = 1 << sbi->s_log_groups_per_flex; 1678 + 1679 + if (groups_per_flex < 2) { 1680 sbi->s_log_groups_per_flex = 0; 1681 return 1; 1682 } 1683 1684 /* We allocate both existing and potentially added groups */ 1685 flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + ··· 2721 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { 2722 if (ext4_load_journal(sb, es, journal_devnum)) 2723 goto failed_mount3; 2724 } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && 2725 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { 2726 ext4_msg(sb, KERN_ERR, "required journal recovery " ··· 3668 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; 3669 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - 3670 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); 3671 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 3672 if (buf->f_bfree < ext4_r_blocks_count(es)) 3673 buf->f_bavail = 0; 3674 buf->f_files = le32_to_cpu(es->s_inodes_count); 3675 buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); 3676 buf->f_namelen = EXT4_NAME_LEN; 3677 fsid = le64_to_cpup((void *)es->s_uuid) ^ 3678 le64_to_cpup((void *)es->s_uuid + sizeof(u64)); ··· 3966 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); 3967 } 3968 3969 + #if !defined(CONTIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 3970 + static struct file_system_type ext2_fs_type = { 3971 + .owner = THIS_MODULE, 3972 + .name = "ext2", 3973 + .get_sb = ext4_get_sb, 3974 + .kill_sb = kill_block_super, 3975 + .fs_flags = FS_REQUIRES_DEV, 3976 + }; 3977 + 3978 + static inline void register_as_ext2(void) 3979 + { 3980 + int err = 
register_filesystem(&ext2_fs_type); 3981 + if (err) 3982 + printk(KERN_WARNING 3983 + "EXT4-fs: Unable to register as ext2 (%d)\n", err); 3984 + } 3985 + 3986 + static inline void unregister_as_ext2(void) 3987 + { 3988 + unregister_filesystem(&ext2_fs_type); 3989 + } 3990 + #else 3991 + static inline void register_as_ext2(void) { } 3992 + static inline void unregister_as_ext2(void) { } 3993 + #endif 3994 + 3995 + #if !defined(CONTIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 3996 + static struct file_system_type ext3_fs_type = { 3997 + .owner = THIS_MODULE, 3998 + .name = "ext3", 3999 + .get_sb = ext4_get_sb, 4000 + .kill_sb = kill_block_super, 4001 + .fs_flags = FS_REQUIRES_DEV, 4002 + }; 4003 + 4004 + static inline void register_as_ext3(void) 4005 + { 4006 + int err = register_filesystem(&ext3_fs_type); 4007 + if (err) 4008 + printk(KERN_WARNING 4009 + "EXT4-fs: Unable to register as ext3 (%d)\n", err); 4010 + } 4011 + 4012 + static inline void unregister_as_ext3(void) 4013 + { 4014 + unregister_filesystem(&ext3_fs_type); 4015 + } 4016 + #else 4017 + static inline void register_as_ext3(void) { } 4018 + static inline void unregister_as_ext3(void) { } 4019 + #endif 4020 + 4021 static struct file_system_type ext4_fs_type = { 4022 .owner = THIS_MODULE, 4023 .name = "ext4", ··· 3995 err = init_inodecache(); 3996 if (err) 3997 goto out1; 3998 + register_as_ext2(); 3999 + register_as_ext3(); 4000 err = register_filesystem(&ext4_fs_type); 4001 if (err) 4002 goto out; 4003 return 0; 4004 out: 4005 + unregister_as_ext2(); 4006 + unregister_as_ext3(); 4007 destroy_inodecache(); 4008 out1: 4009 exit_ext4_xattr(); ··· 4015 4016 static void __exit exit_ext4_fs(void) 4017 { 4018 + unregister_as_ext2(); 4019 + unregister_as_ext3(); 4020 unregister_filesystem(&ext4_fs_type); 4021 destroy_inodecache(); 4022 exit_ext4_xattr();
+9 -6
fs/ext4/xattr.c
··· 482 ea_bdebug(bh, "refcount now=0; freeing"); 483 if (ce) 484 mb_cache_entry_free(ce); 485 - ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1); 486 get_bh(bh); 487 - ext4_forget(handle, 1, inode, bh, bh->b_blocknr); 488 } else { 489 le32_add_cpu(&BHDR(bh)->h_refcount, -1); 490 error = ext4_handle_dirty_metadata(handle, inode, bh); ··· 833 new_bh = sb_getblk(sb, block); 834 if (!new_bh) { 835 getblk_failed: 836 - ext4_free_blocks(handle, inode, block, 1, 1); 837 error = -EIO; 838 goto cleanup; 839 } ··· 990 if (error) 991 goto cleanup; 992 993 if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) { 994 struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); 995 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); ··· 1019 if (flags & XATTR_CREATE) 1020 goto cleanup; 1021 } 1022 - error = ext4_journal_get_write_access(handle, is.iloc.bh); 1023 - if (error) 1024 - goto cleanup; 1025 if (!value) { 1026 if (!is.s.not_found) 1027 error = ext4_xattr_ibody_set(handle, inode, &i, &is);
··· 482 ea_bdebug(bh, "refcount now=0; freeing"); 483 if (ce) 484 mb_cache_entry_free(ce); 485 get_bh(bh); 486 + ext4_free_blocks(handle, inode, bh, 0, 1, 487 + EXT4_FREE_BLOCKS_METADATA | 488 + EXT4_FREE_BLOCKS_FORGET); 489 } else { 490 le32_add_cpu(&BHDR(bh)->h_refcount, -1); 491 error = ext4_handle_dirty_metadata(handle, inode, bh); ··· 832 new_bh = sb_getblk(sb, block); 833 if (!new_bh) { 834 getblk_failed: 835 + ext4_free_blocks(handle, inode, 0, block, 1, 836 + EXT4_FREE_BLOCKS_METADATA); 837 error = -EIO; 838 goto cleanup; 839 } ··· 988 if (error) 989 goto cleanup; 990 991 + error = ext4_journal_get_write_access(handle, is.iloc.bh); 992 + if (error) 993 + goto cleanup; 994 + 995 if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) { 996 struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); 997 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); ··· 1013 if (flags & XATTR_CREATE) 1014 goto cleanup; 1015 } 1016 if (!value) { 1017 if (!is.s.not_found) 1018 error = ext4_xattr_ibody_set(handle, inode, &i, &is);
+4
fs/jbd2/commit.c
··· 636 JBUFFER_TRACE(jh, "ph3: write metadata"); 637 flags = jbd2_journal_write_metadata_buffer(commit_transaction, 638 jh, &new_jh, blocknr); 639 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); 640 wbuf[bufs++] = jh2bh(new_jh); 641
··· 636 JBUFFER_TRACE(jh, "ph3: write metadata"); 637 flags = jbd2_journal_write_metadata_buffer(commit_transaction, 638 jh, &new_jh, blocknr); 639 + if (flags < 0) { 640 + jbd2_journal_abort(journal, flags); 641 + continue; 642 + } 643 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); 644 wbuf[bufs++] = jh2bh(new_jh); 645
+12
fs/jbd2/journal.c
··· 78 EXPORT_SYMBOL(jbd2_journal_ack_err); 79 EXPORT_SYMBOL(jbd2_journal_clear_err); 80 EXPORT_SYMBOL(jbd2_log_wait_commit); 81 EXPORT_SYMBOL(jbd2_journal_start_commit); 82 EXPORT_SYMBOL(jbd2_journal_force_commit_nested); 83 EXPORT_SYMBOL(jbd2_journal_wipe); ··· 359 360 jbd_unlock_bh_state(bh_in); 361 tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); 362 jbd_lock_bh_state(bh_in); 363 if (jh_in->b_frozen_data) { 364 jbd2_free(tmp, bh_in->b_size); ··· 1252 * data from the journal. */ 1253 if (jbd2_journal_recover(journal)) 1254 goto recovery_error; 1255 1256 /* OK, we've finished with the dynamic journal bits: 1257 * reinitialise the dynamic contents of the superblock in memory
··· 78 EXPORT_SYMBOL(jbd2_journal_ack_err); 79 EXPORT_SYMBOL(jbd2_journal_clear_err); 80 EXPORT_SYMBOL(jbd2_log_wait_commit); 81 + EXPORT_SYMBOL(jbd2_log_start_commit); 82 EXPORT_SYMBOL(jbd2_journal_start_commit); 83 EXPORT_SYMBOL(jbd2_journal_force_commit_nested); 84 EXPORT_SYMBOL(jbd2_journal_wipe); ··· 358 359 jbd_unlock_bh_state(bh_in); 360 tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); 361 + if (!tmp) { 362 + jbd2_journal_put_journal_head(new_jh); 363 + return -ENOMEM; 364 + } 365 jbd_lock_bh_state(bh_in); 366 if (jh_in->b_frozen_data) { 367 jbd2_free(tmp, bh_in->b_size); ··· 1247 * data from the journal. */ 1248 if (jbd2_journal_recover(journal)) 1249 goto recovery_error; 1250 + 1251 + if (journal->j_failed_commit) { 1252 + printk(KERN_ERR "JBD2: journal transaction %u on %s " 1253 + "is corrupt.\n", journal->j_failed_commit, 1254 + journal->j_devname); 1255 + return -EIO; 1256 + } 1257 1258 /* OK, we've finished with the dynamic journal bits: 1259 * reinitialise the dynamic contents of the superblock in memory
+40 -14
include/trace/events/ext4.h
··· 38 __entry->blocks = inode->i_blocks; 39 ), 40 41 - TP_printk("dev %s ino %lu mode %d uid %u gid %u blocks %llu", 42 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, 43 __entry->mode, __entry->uid, __entry->gid, 44 (unsigned long long) __entry->blocks) ··· 61 __entry->mode = mode; 62 ), 63 64 - TP_printk("dev %s dir %lu mode %d", 65 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->dir, 66 __entry->mode) 67 ); ··· 85 __entry->mode = mode; 86 ), 87 88 - TP_printk("dev %s ino %lu dir %lu mode %d", 89 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, 90 (unsigned long) __entry->dir, __entry->mode) 91 ); ··· 305 __field( int, ret ) 306 __field( int, pages_written ) 307 __field( long, pages_skipped ) 308 - __field( char, encountered_congestion ) 309 __field( char, more_io ) 310 __field( char, no_nrwrite_index_update ) 311 __field( pgoff_t, writeback_index ) ··· 316 __entry->ret = ret; 317 __entry->pages_written = pages_written; 318 __entry->pages_skipped = wbc->pages_skipped; 319 - __entry->encountered_congestion = wbc->encountered_congestion; 320 __entry->more_io = wbc->more_io; 321 __entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update; 322 __entry->writeback_index = inode->i_mapping->writeback_index; 323 ), 324 325 - TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d writeback_index %lu", 326 jbd2_dev_to_name(__entry->dev), 327 (unsigned long) __entry->ino, __entry->ret, 328 __entry->pages_written, __entry->pages_skipped, 329 - __entry->encountered_congestion, __entry->more_io, 330 __entry->no_nrwrite_index_update, 331 (unsigned long) __entry->writeback_index) 332 ); ··· 589 590 TRACE_EVENT(ext4_free_blocks, 591 TP_PROTO(struct inode *inode, __u64 block, unsigned long count, 592 - int metadata), 593 594 - TP_ARGS(inode, block, count, metadata), 595 596 TP_STRUCT__entry( 597 __field( dev_t, dev ) 598 __field( ino_t, ino ) 599 __field( __u64, block ) 600 
__field( unsigned long, count ) 601 - __field( int, metadata ) 602 - 603 ), 604 605 TP_fast_assign( 606 __entry->dev = inode->i_sb->s_dev; 607 __entry->ino = inode->i_ino; 608 __entry->block = block; 609 __entry->count = count; 610 - __entry->metadata = metadata; 611 ), 612 613 - TP_printk("dev %s ino %lu block %llu count %lu metadata %d", 614 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, 615 - __entry->block, __entry->count, __entry->metadata) 616 ); 617 618 TRACE_EVENT(ext4_sync_file, ··· 846 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, 847 __entry->result_group, __entry->result_start, 848 __entry->result_len, __entry->result_logical) 849 ); 850 851 #endif /* _TRACE_EXT4_H */
··· 38 __entry->blocks = inode->i_blocks; 39 ), 40 41 + TP_printk("dev %s ino %lu mode 0%o uid %u gid %u blocks %llu", 42 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, 43 __entry->mode, __entry->uid, __entry->gid, 44 (unsigned long long) __entry->blocks) ··· 61 __entry->mode = mode; 62 ), 63 64 + TP_printk("dev %s dir %lu mode 0%o", 65 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->dir, 66 __entry->mode) 67 ); ··· 85 __entry->mode = mode; 86 ), 87 88 + TP_printk("dev %s ino %lu dir %lu mode 0%o", 89 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, 90 (unsigned long) __entry->dir, __entry->mode) 91 ); ··· 305 __field( int, ret ) 306 __field( int, pages_written ) 307 __field( long, pages_skipped ) 308 __field( char, more_io ) 309 __field( char, no_nrwrite_index_update ) 310 __field( pgoff_t, writeback_index ) ··· 317 __entry->ret = ret; 318 __entry->pages_written = pages_written; 319 __entry->pages_skipped = wbc->pages_skipped; 320 __entry->more_io = wbc->more_io; 321 __entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update; 322 __entry->writeback_index = inode->i_mapping->writeback_index; 323 ), 324 325 + TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld more_io %d no_nrwrite_index_update %d writeback_index %lu", 326 jbd2_dev_to_name(__entry->dev), 327 (unsigned long) __entry->ino, __entry->ret, 328 __entry->pages_written, __entry->pages_skipped, 329 + __entry->more_io, 330 __entry->no_nrwrite_index_update, 331 (unsigned long) __entry->writeback_index) 332 ); ··· 591 592 TRACE_EVENT(ext4_free_blocks, 593 TP_PROTO(struct inode *inode, __u64 block, unsigned long count, 594 + int flags), 595 596 + TP_ARGS(inode, block, count, flags), 597 598 TP_STRUCT__entry( 599 __field( dev_t, dev ) 600 __field( ino_t, ino ) 601 + __field( umode_t, mode ) 602 __field( __u64, block ) 603 __field( unsigned long, count ) 604 + __field( int, flags ) 605 ), 606 607 TP_fast_assign( 608 __entry->dev = inode->i_sb->s_dev; 609 
__entry->ino = inode->i_ino; 610 + __entry->mode = inode->i_mode; 611 __entry->block = block; 612 __entry->count = count; 613 + __entry->flags = flags; 614 ), 615 616 + TP_printk("dev %s ino %lu mode 0%o block %llu count %lu flags %d", 617 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, 618 + __entry->mode, __entry->block, __entry->count, 619 + __entry->flags) 620 ); 621 622 TRACE_EVENT(ext4_sync_file, ··· 846 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, 847 __entry->result_group, __entry->result_start, 848 __entry->result_len, __entry->result_logical) 849 + ); 850 + 851 + TRACE_EVENT(ext4_forget, 852 + TP_PROTO(struct inode *inode, int is_metadata, __u64 block), 853 + 854 + TP_ARGS(inode, is_metadata, block), 855 + 856 + TP_STRUCT__entry( 857 + __field( dev_t, dev ) 858 + __field( ino_t, ino ) 859 + __field( umode_t, mode ) 860 + __field( int, is_metadata ) 861 + __field( __u64, block ) 862 + ), 863 + 864 + TP_fast_assign( 865 + __entry->dev = inode->i_sb->s_dev; 866 + __entry->ino = inode->i_ino; 867 + __entry->mode = inode->i_mode; 868 + __entry->is_metadata = is_metadata; 869 + __entry->block = block; 870 + ), 871 + 872 + TP_printk("dev %s ino %lu mode 0%o is_metadata %d block %llu", 873 + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, 874 + __entry->mode, __entry->is_metadata, __entry->block) 875 ); 876 877 #endif /* _TRACE_EXT4_H */