Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (47 commits)
ext4: Fix potential fiemap deadlock (mmap_sem vs. i_data_sem)
ext4: Do not override ext2 or ext3 if they are built as modules
jbd2: Export jbd2_log_start_commit to fix ext4 build
ext4: Fix insufficient checks in EXT4_IOC_MOVE_EXT
ext4: Wait for proper transaction commit on fsync
ext4: fix incorrect block reservation on quota transfer.
ext4: quota macros cleanup
ext4: ext4_get_reserved_space() must return bytes instead of blocks
ext4: remove blocks from inode prealloc list on failure
ext4: wait for log to commit when umounting
ext4: Avoid data / filesystem corruption when write fails to copy data
ext4: Use ext4 file system driver for ext2/ext3 file system mounts
ext4: Return the PTR_ERR of the correct pointer in setup_new_group_blocks()
jbd2: Add ENOMEM checking in and for jbd2_journal_write_metadata_buffer()
ext4: remove unused parameter wbc from __ext4_journalled_writepage()
ext4: remove encountered_congestion trace
ext4: move_extent_per_page() cleanup
ext4: initialize moved_len before calling ext4_move_extents()
ext4: Fix double-free of blocks with EXT4_IOC_MOVE_EXT
ext4: use ext4_data_block_valid() in ext4_free_blocks()
...

+678 -515
+8 -2
Documentation/filesystems/ext4.txt
··· 153 153 identified through its new major/minor numbers encoded 154 154 in devnum. 155 155 156 - noload Don't load the journal on mounting. Note that 157 - if the filesystem was not unmounted cleanly, 156 + norecovery Don't load the journal on mounting. Note that 157 + noload if the filesystem was not unmounted cleanly, 158 158 skipping the journal replay will lead to the 159 159 filesystem containing inconsistencies that can 160 160 lead to any number of problems. ··· 352 352 "zero-length" problem that can happen when a 353 353 system crashes before the delayed allocation 354 354 blocks are forced to disk. 355 + 356 + discard Controls whether ext4 should issue discard/TRIM 357 + nodiscard(*) commands to the underlying block device when 358 + blocks are freed. This is useful for SSD devices 359 + and sparse/thinly-provisioned LUNs, but it is off 360 + by default until sufficient testing has been done. 355 361 356 362 Data Mode 357 363 =========
+10
fs/ext4/Kconfig
··· 26 26 27 27 If unsure, say N. 28 28 29 + config EXT4_USE_FOR_EXT23 30 + bool "Use ext4 for ext2/ext3 file systems" 31 + depends on EXT3_FS=n || EXT2_FS=n 32 + default y 33 + help 34 + Allow the ext4 file system driver code to be used for ext2 or 35 + ext3 file system mounts. This allows users to reduce their 36 + compiled kernel size by using one file system driver for 37 + ext2, ext3, and ext4 file systems. 38 + 29 39 config EXT4_FS_XATTR 30 40 bool "Ext4 extended attributes" 31 41 depends on EXT4_FS
+7 -39
fs/ext4/balloc.c
··· 499 499 } 500 500 501 501 /** 502 - * ext4_free_blocks() -- Free given blocks and update quota 503 - * @handle: handle for this transaction 504 - * @inode: inode 505 - * @block: start physical block to free 506 - * @count: number of blocks to count 507 - * @metadata: Are these metadata blocks 508 - */ 509 - void ext4_free_blocks(handle_t *handle, struct inode *inode, 510 - ext4_fsblk_t block, unsigned long count, 511 - int metadata) 512 - { 513 - struct super_block *sb; 514 - unsigned long dquot_freed_blocks; 515 - 516 - /* this isn't the right place to decide whether block is metadata 517 - * inode.c/extents.c knows better, but for safety ... */ 518 - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 519 - metadata = 1; 520 - 521 - /* We need to make sure we don't reuse 522 - * block released untill the transaction commit. 523 - * writeback mode have weak data consistency so 524 - * don't force data as metadata when freeing block 525 - * for writeback mode. 526 - */ 527 - if (metadata == 0 && !ext4_should_writeback_data(inode)) 528 - metadata = 1; 529 - 530 - sb = inode->i_sb; 531 - 532 - ext4_mb_free_blocks(handle, inode, block, count, 533 - metadata, &dquot_freed_blocks); 534 - if (dquot_freed_blocks) 535 - vfs_dq_free_block(inode, dquot_freed_blocks); 536 - return; 537 - } 538 - 539 - /** 540 502 * ext4_has_free_blocks() 541 503 * @sbi: in-core super block structure. 542 504 * @nblocks: number of needed blocks ··· 723 761 static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, 724 762 ext4_group_t group) 725 763 { 726 - return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0; 764 + if (!ext4_bg_has_super(sb, group)) 765 + return 0; 766 + 767 + if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG)) 768 + return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); 769 + else 770 + return EXT4_SB(sb)->s_gdb_count; 727 771 } 728 772 729 773 /**
+2 -1
fs/ext4/block_validity.c
··· 160 160 if (ext4_bg_has_super(sb, i) && 161 161 ((i < 5) || ((i % flex_size) == 0))) 162 162 add_system_zone(sbi, ext4_group_first_block_no(sb, i), 163 - sbi->s_gdb_count + 1); 163 + ext4_bg_num_gdb(sb, i) + 1); 164 164 gdp = ext4_get_group_desc(sb, i, NULL); 165 165 ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); 166 166 if (ret) ··· 228 228 struct rb_node *n = sbi->system_blks.rb_node; 229 229 230 230 if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || 231 + (start_blk + count < start_blk) || 231 232 (start_blk + count > ext4_blocks_count(sbi->s_es))) 232 233 return 0; 233 234 while (n) {
+17 -6
fs/ext4/ext4.h
··· 376 376 EXT4_GET_BLOCKS_DIO_CREATE_EXT) 377 377 378 378 /* 379 + * Flags used by ext4_free_blocks 380 + */ 381 + #define EXT4_FREE_BLOCKS_METADATA 0x0001 382 + #define EXT4_FREE_BLOCKS_FORGET 0x0002 383 + 384 + /* 379 385 * ioctl commands 380 386 */ 381 387 #define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS ··· 709 703 struct list_head i_aio_dio_complete_list; 710 704 /* current io_end structure for async DIO write*/ 711 705 ext4_io_end_t *cur_aio_dio; 706 + 707 + /* 708 + * Transactions that contain inode's metadata needed to complete 709 + * fsync and fdatasync, respectively. 710 + */ 711 + tid_t i_sync_tid; 712 + tid_t i_datasync_tid; 712 713 }; 713 714 714 715 /* ··· 763 750 #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 764 751 #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ 765 752 #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ 753 + #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ 766 754 767 755 #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt 768 756 #define set_opt(o, opt) o |= EXT4_MOUNT_##opt ··· 1338 1324 ext4_fsblk_t goal, unsigned long *count, int *errp); 1339 1325 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); 1340 1326 extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); 1341 - extern void ext4_free_blocks(handle_t *handle, struct inode *inode, 1342 - ext4_fsblk_t block, unsigned long count, int metadata); 1343 1327 extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, 1344 1328 ext4_fsblk_t block, unsigned long count); 1345 1329 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); ··· 1396 1384 extern void ext4_discard_preallocations(struct inode *); 1397 1385 extern int __init init_ext4_mballoc(void); 1398 1386 extern void exit_ext4_mballoc(void); 1399 - extern void ext4_mb_free_blocks(handle_t *, struct inode *, 1400 - ext4_fsblk_t, unsigned long, int, unsigned long *); 1387 + extern 
void ext4_free_blocks(handle_t *handle, struct inode *inode, 1388 + struct buffer_head *bh, ext4_fsblk_t block, 1389 + unsigned long count, int flags); 1401 1390 extern int ext4_mb_add_groupinfo(struct super_block *sb, 1402 1391 ext4_group_t i, struct ext4_group_desc *desc); 1403 1392 extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); 1404 1393 extern void ext4_mb_put_buddy_cache_lock(struct super_block *, 1405 1394 ext4_group_t, int); 1406 1395 /* inode.c */ 1407 - int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 1408 - struct buffer_head *bh, ext4_fsblk_t blocknr); 1409 1396 struct buffer_head *ext4_getblk(handle_t *, struct inode *, 1410 1397 ext4_lblk_t, int, int *); 1411 1398 struct buffer_head *ext4_bread(handle_t *, struct inode *,
+61 -25
fs/ext4/ext4_jbd2.c
··· 4 4 5 5 #include "ext4_jbd2.h" 6 6 7 + #include <trace/events/ext4.h> 8 + 7 9 int __ext4_journal_get_undo_access(const char *where, handle_t *handle, 8 10 struct buffer_head *bh) 9 11 { ··· 34 32 return err; 35 33 } 36 34 37 - int __ext4_journal_forget(const char *where, handle_t *handle, 38 - struct buffer_head *bh) 35 + /* 36 + * The ext4 forget function must perform a revoke if we are freeing data 37 + * which has been journaled. Metadata (eg. indirect blocks) must be 38 + * revoked in all cases. 39 + * 40 + * "bh" may be NULL: a metadata block may have been freed from memory 41 + * but there may still be a record of it in the journal, and that record 42 + * still needs to be revoked. 43 + * 44 + * If the handle isn't valid we're not journaling, but we still need to 45 + * call into ext4_journal_revoke() to put the buffer head. 46 + */ 47 + int __ext4_forget(const char *where, handle_t *handle, int is_metadata, 48 + struct inode *inode, struct buffer_head *bh, 49 + ext4_fsblk_t blocknr) 39 50 { 40 - int err = 0; 51 + int err; 41 52 42 - if (ext4_handle_valid(handle)) { 43 - err = jbd2_journal_forget(handle, bh); 44 - if (err) 45 - ext4_journal_abort_handle(where, __func__, bh, 46 - handle, err); 47 - } 48 - else 53 + might_sleep(); 54 + 55 + trace_ext4_forget(inode, is_metadata, blocknr); 56 + BUFFER_TRACE(bh, "enter"); 57 + 58 + jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " 59 + "data mode %x\n", 60 + bh, is_metadata, inode->i_mode, 61 + test_opt(inode->i_sb, DATA_FLAGS)); 62 + 63 + /* In the no journal case, we can just do a bforget and return */ 64 + if (!ext4_handle_valid(handle)) { 49 65 bforget(bh); 50 - return err; 51 - } 52 - 53 - int __ext4_journal_revoke(const char *where, handle_t *handle, 54 - ext4_fsblk_t blocknr, struct buffer_head *bh) 55 - { 56 - int err = 0; 57 - 58 - if (ext4_handle_valid(handle)) { 59 - err = jbd2_journal_revoke(handle, blocknr, bh); 60 - if (err) 61 - ext4_journal_abort_handle(where, __func__, bh, 62 - 
handle, err); 66 + return 0; 63 67 } 64 - else 65 - bforget(bh); 68 + 69 + /* Never use the revoke function if we are doing full data 70 + * journaling: there is no need to, and a V1 superblock won't 71 + * support it. Otherwise, only skip the revoke on un-journaled 72 + * data blocks. */ 73 + 74 + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || 75 + (!is_metadata && !ext4_should_journal_data(inode))) { 76 + if (bh) { 77 + BUFFER_TRACE(bh, "call jbd2_journal_forget"); 78 + err = jbd2_journal_forget(handle, bh); 79 + if (err) 80 + ext4_journal_abort_handle(where, __func__, bh, 81 + handle, err); 82 + return err; 83 + } 84 + return 0; 85 + } 86 + 87 + /* 88 + * data!=journal && (is_metadata || should_journal_data(inode)) 89 + */ 90 + BUFFER_TRACE(bh, "call jbd2_journal_revoke"); 91 + err = jbd2_journal_revoke(handle, blocknr, bh); 92 + if (err) { 93 + ext4_journal_abort_handle(where, __func__, bh, handle, err); 94 + ext4_abort(inode->i_sb, __func__, 95 + "error %d when attempting revoke", err); 96 + } 97 + BUFFER_TRACE(bh, "exit"); 66 98 return err; 67 99 } 68 100
+26 -18
fs/ext4/ext4_jbd2.h
··· 49 49 50 50 #define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ 51 51 EXT4_XATTR_TRANS_BLOCKS - 2 + \ 52 - 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) 52 + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) 53 53 54 54 /* 55 55 * Define the number of metadata blocks we need to account to modify data. ··· 57 57 * This include super block, inode block, quota blocks and xattr blocks 58 58 */ 59 59 #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ 60 - 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) 60 + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) 61 61 62 62 /* Delete operations potentially hit one directory's namespace plus an 63 63 * entire inode, plus arbitrary amounts of bitmap/indirection data. Be ··· 92 92 * but inode, sb and group updates are done only once */ 93 93 #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ 94 94 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) 95 + 95 96 #define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ 96 97 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) 97 98 #else ··· 100 99 #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 101 100 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0 102 101 #endif 102 + #define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) 103 + #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) 104 + #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) 103 105 104 106 int 105 107 ext4_mark_iloc_dirty(handle_t *handle, ··· 120 116 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); 121 117 122 118 /* 123 - * Wrapper functions with which ext4 calls into JBD. The intent here is 124 - * to allow these to be turned into appropriate stubs so ext4 can control 125 - * ext2 filesystems, so ext2+ext4 systems only nee one fs. This work hasn't 126 - * been done yet. 119 + * Wrapper functions with which ext4 calls into JBD. 
127 120 */ 128 - 129 121 void ext4_journal_abort_handle(const char *caller, const char *err_fn, 130 122 struct buffer_head *bh, handle_t *handle, int err); 131 123 ··· 131 131 int __ext4_journal_get_write_access(const char *where, handle_t *handle, 132 132 struct buffer_head *bh); 133 133 134 - /* When called with an invalid handle, this will still do a put on the BH */ 135 - int __ext4_journal_forget(const char *where, handle_t *handle, 136 - struct buffer_head *bh); 137 - 138 - /* When called with an invalid handle, this will still do a put on the BH */ 139 - int __ext4_journal_revoke(const char *where, handle_t *handle, 140 - ext4_fsblk_t blocknr, struct buffer_head *bh); 134 + int __ext4_forget(const char *where, handle_t *handle, int is_metadata, 135 + struct inode *inode, struct buffer_head *bh, 136 + ext4_fsblk_t blocknr); 141 137 142 138 int __ext4_journal_get_create_access(const char *where, 143 139 handle_t *handle, struct buffer_head *bh); ··· 145 149 __ext4_journal_get_undo_access(__func__, (handle), (bh)) 146 150 #define ext4_journal_get_write_access(handle, bh) \ 147 151 __ext4_journal_get_write_access(__func__, (handle), (bh)) 148 - #define ext4_journal_revoke(handle, blocknr, bh) \ 149 - __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) 152 + #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ 153 + __ext4_forget(__func__, (handle), (is_metadata), (inode), (bh),\ 154 + (block_nr)) 150 155 #define ext4_journal_get_create_access(handle, bh) \ 151 156 __ext4_journal_get_create_access(__func__, (handle), (bh)) 152 - #define ext4_journal_forget(handle, bh) \ 153 - __ext4_journal_forget(__func__, (handle), (bh)) 154 157 #define ext4_handle_dirty_metadata(handle, inode, bh) \ 155 158 __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh)) 156 159 ··· 247 252 if (ext4_handle_valid(handle)) 248 253 return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); 249 254 return 0; 255 + } 256 + 257 + static inline void 
ext4_update_inode_fsync_trans(handle_t *handle, 258 + struct inode *inode, 259 + int datasync) 260 + { 261 + struct ext4_inode_info *ei = EXT4_I(inode); 262 + 263 + if (ext4_handle_valid(handle)) { 264 + ei->i_sync_tid = handle->h_transaction->t_tid; 265 + if (datasync) 266 + ei->i_datasync_tid = handle->h_transaction->t_tid; 267 + } 250 268 } 251 269 252 270 /* super.c */
+24 -20
fs/ext4/extents.c
··· 1007 1007 for (i = 0; i < depth; i++) { 1008 1008 if (!ablocks[i]) 1009 1009 continue; 1010 - ext4_free_blocks(handle, inode, ablocks[i], 1, 1); 1010 + ext4_free_blocks(handle, inode, 0, ablocks[i], 1, 1011 + EXT4_FREE_BLOCKS_METADATA); 1011 1012 } 1012 1013 } 1013 1014 kfree(ablocks); ··· 1762 1761 while (block < last && block != EXT_MAX_BLOCK) { 1763 1762 num = last - block; 1764 1763 /* find extent for this block */ 1764 + down_read(&EXT4_I(inode)->i_data_sem); 1765 1765 path = ext4_ext_find_extent(inode, block, path); 1766 + up_read(&EXT4_I(inode)->i_data_sem); 1766 1767 if (IS_ERR(path)) { 1767 1768 err = PTR_ERR(path); 1768 1769 path = NULL; ··· 1960 1957 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, 1961 1958 struct ext4_ext_path *path) 1962 1959 { 1963 - struct buffer_head *bh; 1964 1960 int err; 1965 1961 ext4_fsblk_t leaf; 1966 1962 ··· 1975 1973 if (err) 1976 1974 return err; 1977 1975 ext_debug("index is empty, remove it, free block %llu\n", leaf); 1978 - bh = sb_find_get_block(inode->i_sb, leaf); 1979 - ext4_forget(handle, 1, inode, bh, leaf); 1980 - ext4_free_blocks(handle, inode, leaf, 1, 1); 1976 + ext4_free_blocks(handle, inode, 0, leaf, 1, 1977 + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 1981 1978 return err; 1982 1979 } 1983 1980 ··· 2043 2042 struct ext4_extent *ex, 2044 2043 ext4_lblk_t from, ext4_lblk_t to) 2045 2044 { 2046 - struct buffer_head *bh; 2047 2045 unsigned short ee_len = ext4_ext_get_actual_len(ex); 2048 - int i, metadata = 0; 2046 + int flags = EXT4_FREE_BLOCKS_FORGET; 2049 2047 2050 2048 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 2051 - metadata = 1; 2049 + flags |= EXT4_FREE_BLOCKS_METADATA; 2052 2050 #ifdef EXTENTS_STATS 2053 2051 { 2054 2052 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ··· 2072 2072 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2073 2073 start = ext_pblock(ex) + ee_len - num; 2074 2074 ext_debug("free last %u blocks starting %llu\n", num, start); 2075 - for 
(i = 0; i < num; i++) { 2076 - bh = sb_find_get_block(inode->i_sb, start + i); 2077 - ext4_forget(handle, 0, inode, bh, start + i); 2078 - } 2079 - ext4_free_blocks(handle, inode, start, num, metadata); 2075 + ext4_free_blocks(handle, inode, 0, start, num, flags); 2080 2076 } else if (from == le32_to_cpu(ex->ee_block) 2081 2077 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 2082 2078 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", ··· 2163 2167 correct_index = 1; 2164 2168 credits += (ext_depth(inode)) + 1; 2165 2169 } 2166 - credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); 2170 + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); 2167 2171 2168 2172 err = ext4_ext_truncate_extend_restart(handle, inode, credits); 2169 2173 if (err) ··· 3060 3064 if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { 3061 3065 ret = ext4_convert_unwritten_extents_dio(handle, inode, 3062 3066 path); 3067 + if (ret >= 0) 3068 + ext4_update_inode_fsync_trans(handle, inode, 1); 3063 3069 goto out2; 3064 3070 } 3065 3071 /* buffered IO case */ ··· 3089 3091 ret = ext4_ext_convert_to_initialized(handle, inode, 3090 3092 path, iblock, 3091 3093 max_blocks); 3094 + if (ret >= 0) 3095 + ext4_update_inode_fsync_trans(handle, inode, 1); 3092 3096 out: 3093 3097 if (ret <= 0) { 3094 3098 err = ret; ··· 3319 3319 /* not a good idea to call discard here directly, 3320 3320 * but otherwise we'd need to call it every free() */ 3321 3321 ext4_discard_preallocations(inode); 3322 - ext4_free_blocks(handle, inode, ext_pblock(&newex), 3323 - ext4_ext_get_actual_len(&newex), 0); 3322 + ext4_free_blocks(handle, inode, 0, ext_pblock(&newex), 3323 + ext4_ext_get_actual_len(&newex), 0); 3324 3324 goto out2; 3325 3325 } 3326 3326 ··· 3329 3329 allocated = ext4_ext_get_actual_len(&newex); 3330 3330 set_buffer_new(bh_result); 3331 3331 3332 - /* Cache only when it is _not_ an uninitialized extent */ 3333 - if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) 3332 + /* 3333 + * Cache the extent and 
update transaction to commit on fdatasync only 3334 + * when it is _not_ an uninitialized extent. 3335 + */ 3336 + if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { 3334 3337 ext4_ext_put_in_cache(inode, iblock, allocated, newblock, 3335 3338 EXT4_EXT_CACHE_EXTENT); 3339 + ext4_update_inode_fsync_trans(handle, inode, 1); 3340 + } else 3341 + ext4_update_inode_fsync_trans(handle, inode, 0); 3336 3342 out: 3337 3343 if (allocated > max_blocks) 3338 3344 allocated = max_blocks; ··· 3726 3720 * Walk the extent tree gathering extent information. 3727 3721 * ext4_ext_fiemap_cb will push extents back to user. 3728 3722 */ 3729 - down_read(&EXT4_I(inode)->i_data_sem); 3730 3723 error = ext4_ext_walk_space(inode, start_blk, len_blks, 3731 3724 ext4_ext_fiemap_cb, fieinfo); 3732 - up_read(&EXT4_I(inode)->i_data_sem); 3733 3725 } 3734 3726 3735 3727 return error;
+20 -34
fs/ext4/fsync.c
··· 51 51 int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) 52 52 { 53 53 struct inode *inode = dentry->d_inode; 54 + struct ext4_inode_info *ei = EXT4_I(inode); 54 55 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 55 - int err, ret = 0; 56 + int ret; 57 + tid_t commit_tid; 56 58 57 59 J_ASSERT(ext4_journal_current_handle() == NULL); 58 60 59 61 trace_ext4_sync_file(file, dentry, datasync); 60 62 63 + if (inode->i_sb->s_flags & MS_RDONLY) 64 + return 0; 65 + 61 66 ret = flush_aio_dio_completed_IO(inode); 62 67 if (ret < 0) 63 - goto out; 68 + return ret; 69 + 70 + if (!journal) 71 + return simple_fsync(file, dentry, datasync); 72 + 64 73 /* 65 - * data=writeback: 74 + * data=writeback,ordered: 66 75 * The caller's filemap_fdatawrite()/wait will sync the data. 67 - * sync_inode() will sync the metadata 68 - * 69 - * data=ordered: 70 - * The caller's filemap_fdatawrite() will write the data and 71 - * sync_inode() will write the inode if it is dirty. Then the caller's 72 - * filemap_fdatawait() will wait on the pages. 76 + * Metadata is in the journal, we wait for proper transaction to 77 + * commit here. 73 78 * 74 79 * data=journal: 75 80 * filemap_fdatawrite won't do anything (the buffers are clean). ··· 84 79 * (they were dirtied by commit). But that's OK - the blocks are 85 80 * safe in-journal, which is all fsync() needs to ensure. 86 81 */ 87 - if (ext4_should_journal_data(inode)) { 88 - ret = ext4_force_commit(inode->i_sb); 89 - goto out; 90 - } 82 + if (ext4_should_journal_data(inode)) 83 + return ext4_force_commit(inode->i_sb); 91 84 92 - if (!journal) 93 - ret = sync_mapping_buffers(inode->i_mapping); 94 - 95 - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 96 - goto out; 97 - 98 - /* 99 - * The VFS has written the file data. If the inode is unaltered 100 - * then we need not start a commit. 
101 - */ 102 - if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) { 103 - struct writeback_control wbc = { 104 - .sync_mode = WB_SYNC_ALL, 105 - .nr_to_write = 0, /* sys_fsync did this */ 106 - }; 107 - err = sync_inode(inode, &wbc); 108 - if (ret == 0) 109 - ret = err; 110 - } 111 - out: 112 - if (journal && (journal->j_flags & JBD2_BARRIER)) 85 + commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; 86 + if (jbd2_log_start_commit(journal, commit_tid)) 87 + jbd2_log_wait_commit(journal, commit_tid); 88 + else if (journal->j_flags & JBD2_BARRIER) 113 89 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 114 90 return ret; 115 91 }
+85 -108
fs/ext4/inode.c
··· 71 71 } 72 72 73 73 /* 74 - * The ext4 forget function must perform a revoke if we are freeing data 75 - * which has been journaled. Metadata (eg. indirect blocks) must be 76 - * revoked in all cases. 77 - * 78 - * "bh" may be NULL: a metadata block may have been freed from memory 79 - * but there may still be a record of it in the journal, and that record 80 - * still needs to be revoked. 81 - * 82 - * If the handle isn't valid we're not journaling, but we still need to 83 - * call into ext4_journal_revoke() to put the buffer head. 84 - */ 85 - int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 86 - struct buffer_head *bh, ext4_fsblk_t blocknr) 87 - { 88 - int err; 89 - 90 - might_sleep(); 91 - 92 - BUFFER_TRACE(bh, "enter"); 93 - 94 - jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " 95 - "data mode %x\n", 96 - bh, is_metadata, inode->i_mode, 97 - test_opt(inode->i_sb, DATA_FLAGS)); 98 - 99 - /* Never use the revoke function if we are doing full data 100 - * journaling: there is no need to, and a V1 superblock won't 101 - * support it. Otherwise, only skip the revoke on un-journaled 102 - * data blocks. */ 103 - 104 - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || 105 - (!is_metadata && !ext4_should_journal_data(inode))) { 106 - if (bh) { 107 - BUFFER_TRACE(bh, "call jbd2_journal_forget"); 108 - return ext4_journal_forget(handle, bh); 109 - } 110 - return 0; 111 - } 112 - 113 - /* 114 - * data!=journal && (is_metadata || should_journal_data(inode)) 115 - */ 116 - BUFFER_TRACE(bh, "call ext4_journal_revoke"); 117 - err = ext4_journal_revoke(handle, blocknr, bh); 118 - if (err) 119 - ext4_abort(inode->i_sb, __func__, 120 - "error %d when attempting revoke", err); 121 - BUFFER_TRACE(bh, "exit"); 122 - return err; 123 - } 124 - 125 - /* 126 74 * Work out how many blocks we need to proceed with the next chunk of a 127 75 * truncate transaction. 
128 76 */ ··· 669 721 return ret; 670 722 failed_out: 671 723 for (i = 0; i < index; i++) 672 - ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 724 + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); 673 725 return ret; 674 726 } 675 727 ··· 765 817 return err; 766 818 failed: 767 819 /* Allocation failed, free what we already allocated */ 820 + ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); 768 821 for (i = 1; i <= n ; i++) { 769 - BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); 770 - ext4_journal_forget(handle, branch[i].bh); 822 + /* 823 + * branch[i].bh is newly allocated, so there is no 824 + * need to revoke the block, which is why we don't 825 + * need to set EXT4_FREE_BLOCKS_METADATA. 826 + */ 827 + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 828 + EXT4_FREE_BLOCKS_FORGET); 771 829 } 772 - for (i = 0; i < indirect_blks; i++) 773 - ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 830 + for (i = n+1; i < indirect_blks; i++) 831 + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); 774 832 775 - ext4_free_blocks(handle, inode, new_blocks[i], num, 0); 833 + ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0); 776 834 777 835 return err; 778 836 } ··· 857 903 858 904 err_out: 859 905 for (i = 1; i <= num; i++) { 860 - BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); 861 - ext4_journal_forget(handle, where[i].bh); 862 - ext4_free_blocks(handle, inode, 863 - le32_to_cpu(where[i-1].key), 1, 0); 906 + /* 907 + * branch[i].bh is newly allocated, so there is no 908 + * need to revoke the block, which is why we don't 909 + * need to set EXT4_FREE_BLOCKS_METADATA. 
910 + */ 911 + ext4_free_blocks(handle, inode, where[i].bh, 0, 1, 912 + EXT4_FREE_BLOCKS_FORGET); 864 913 } 865 - ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0); 914 + ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key), 915 + blks, 0); 866 916 867 917 return err; 868 918 } ··· 979 1021 if (!err) 980 1022 err = ext4_splice_branch(handle, inode, iblock, 981 1023 partial, indirect_blks, count); 982 - else 1024 + if (err) 983 1025 goto cleanup; 984 1026 985 1027 set_buffer_new(bh_result); 1028 + 1029 + ext4_update_inode_fsync_trans(handle, inode, 1); 986 1030 got_it: 987 1031 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 988 1032 if (count > blocks_to_boundary) ··· 1012 1052 EXT4_I(inode)->i_reserved_meta_blocks; 1013 1053 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1014 1054 1015 - return total; 1055 + return (total << inode->i_blkbits); 1016 1056 } 1017 1057 /* 1018 1058 * Calculate the number of metadata blocks need to reserve ··· 1494 1534 return ext4_journal_get_write_access(handle, bh); 1495 1535 } 1496 1536 1537 + /* 1538 + * Truncate blocks that were not used by write. We have to truncate the 1539 + * pagecache as well so that corresponding buffers get properly unmapped. 
1540 + */ 1541 + static void ext4_truncate_failed_write(struct inode *inode) 1542 + { 1543 + truncate_inode_pages(inode->i_mapping, inode->i_size); 1544 + ext4_truncate(inode); 1545 + } 1546 + 1497 1547 static int ext4_write_begin(struct file *file, struct address_space *mapping, 1498 1548 loff_t pos, unsigned len, unsigned flags, 1499 1549 struct page **pagep, void **fsdata) ··· 1569 1599 1570 1600 ext4_journal_stop(handle); 1571 1601 if (pos + len > inode->i_size) { 1572 - ext4_truncate(inode); 1602 + ext4_truncate_failed_write(inode); 1573 1603 /* 1574 1604 * If truncate failed early the inode might 1575 1605 * still be on the orphan list; we need to ··· 1679 1709 ret = ret2; 1680 1710 1681 1711 if (pos + len > inode->i_size) { 1682 - ext4_truncate(inode); 1712 + ext4_truncate_failed_write(inode); 1683 1713 /* 1684 1714 * If truncate failed early the inode might still be 1685 1715 * on the orphan list; we need to make sure the inode ··· 1721 1751 ret = ret2; 1722 1752 1723 1753 if (pos + len > inode->i_size) { 1724 - ext4_truncate(inode); 1754 + ext4_truncate_failed_write(inode); 1725 1755 /* 1726 1756 * If truncate failed early the inode might still be 1727 1757 * on the orphan list; we need to make sure the inode ··· 1784 1814 if (!ret) 1785 1815 ret = ret2; 1786 1816 if (pos + len > inode->i_size) { 1787 - ext4_truncate(inode); 1817 + ext4_truncate_failed_write(inode); 1788 1818 /* 1789 1819 * If truncate failed early the inode might still be 1790 1820 * on the orphan list; we need to make sure the inode ··· 2570 2600 } 2571 2601 2572 2602 static int __ext4_journalled_writepage(struct page *page, 2573 - struct writeback_control *wbc, 2574 2603 unsigned int len) 2575 2604 { 2576 2605 struct address_space *mapping = page->mapping; ··· 2727 2758 * doesn't seem much point in redirtying the page here. 
2728 2759 */ 2729 2760 ClearPageChecked(page); 2730 - return __ext4_journalled_writepage(page, wbc, len); 2761 + return __ext4_journalled_writepage(page, len); 2731 2762 } 2732 2763 2733 2764 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) ··· 2757 2788 * number of contiguous block. So we will limit 2758 2789 * number of contiguous block to a sane value 2759 2790 */ 2760 - if (!(inode->i_flags & EXT4_EXTENTS_FL) && 2791 + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) && 2761 2792 (max_blocks > EXT4_MAX_TRANS_DATA)) 2762 2793 max_blocks = EXT4_MAX_TRANS_DATA; 2763 2794 ··· 3060 3091 * i_size_read because we hold i_mutex. 3061 3092 */ 3062 3093 if (pos + len > inode->i_size) 3063 - ext4_truncate(inode); 3094 + ext4_truncate_failed_write(inode); 3064 3095 } 3065 3096 3066 3097 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) ··· 4089 4120 __le32 *last) 4090 4121 { 4091 4122 __le32 *p; 4123 + int flags = EXT4_FREE_BLOCKS_FORGET; 4124 + 4125 + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 4126 + flags |= EXT4_FREE_BLOCKS_METADATA; 4127 + 4092 4128 if (try_to_extend_transaction(handle, inode)) { 4093 4129 if (bh) { 4094 4130 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); ··· 4108 4134 } 4109 4135 } 4110 4136 4111 - /* 4112 - * Any buffers which are on the journal will be in memory. We 4113 - * find them on the hash table so jbd2_journal_revoke() will 4114 - * run jbd2_journal_forget() on them. We've already detached 4115 - * each block from the file, so bforget() in 4116 - * jbd2_journal_forget() should be safe. 4117 - * 4118 - * AKPM: turn on bforget in jbd2_journal_forget()!!! 
4119 - */ 4120 - for (p = first; p < last; p++) { 4121 - u32 nr = le32_to_cpu(*p); 4122 - if (nr) { 4123 - struct buffer_head *tbh; 4137 + for (p = first; p < last; p++) 4138 + *p = 0; 4124 4139 4125 - *p = 0; 4126 - tbh = sb_find_get_block(inode->i_sb, nr); 4127 - ext4_forget(handle, 0, inode, tbh, nr); 4128 - } 4129 - } 4130 - 4131 - ext4_free_blocks(handle, inode, block_to_free, count, 0); 4140 + ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); 4132 4141 } 4133 4142 4134 4143 /** ··· 4299 4342 blocks_for_truncate(inode)); 4300 4343 } 4301 4344 4302 - ext4_free_blocks(handle, inode, nr, 1, 1); 4345 + ext4_free_blocks(handle, inode, 0, nr, 1, 4346 + EXT4_FREE_BLOCKS_METADATA); 4303 4347 4304 4348 if (parent_bh) { 4305 4349 /* ··· 4739 4781 struct ext4_iloc iloc; 4740 4782 struct ext4_inode *raw_inode; 4741 4783 struct ext4_inode_info *ei; 4742 - struct buffer_head *bh; 4743 4784 struct inode *inode; 4785 + journal_t *journal = EXT4_SB(sb)->s_journal; 4744 4786 long ret; 4745 4787 int block; 4746 4788 ··· 4751 4793 return inode; 4752 4794 4753 4795 ei = EXT4_I(inode); 4796 + iloc.bh = 0; 4754 4797 4755 4798 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4756 4799 if (ret < 0) 4757 4800 goto bad_inode; 4758 - bh = iloc.bh; 4759 4801 raw_inode = ext4_raw_inode(&iloc); 4760 4802 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4761 4803 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); ··· 4778 4820 if (inode->i_mode == 0 || 4779 4821 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 4780 4822 /* this inode is deleted */ 4781 - brelse(bh); 4782 4823 ret = -ESTALE; 4783 4824 goto bad_inode; 4784 4825 } ··· 4805 4848 ei->i_data[block] = raw_inode->i_block[block]; 4806 4849 INIT_LIST_HEAD(&ei->i_orphan); 4807 4850 4851 + /* 4852 + * Set transaction id's of transactions that have to be committed 4853 + * to finish f[data]sync. 
We set them to currently running transaction 4854 + * as we cannot be sure that the inode or some of its metadata isn't 4855 + * part of the transaction - the inode could have been reclaimed and 4856 + * now it is reread from disk. 4857 + */ 4858 + if (journal) { 4859 + transaction_t *transaction; 4860 + tid_t tid; 4861 + 4862 + spin_lock(&journal->j_state_lock); 4863 + if (journal->j_running_transaction) 4864 + transaction = journal->j_running_transaction; 4865 + else 4866 + transaction = journal->j_committing_transaction; 4867 + if (transaction) 4868 + tid = transaction->t_tid; 4869 + else 4870 + tid = journal->j_commit_sequence; 4871 + spin_unlock(&journal->j_state_lock); 4872 + ei->i_sync_tid = tid; 4873 + ei->i_datasync_tid = tid; 4874 + } 4875 + 4808 4876 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4809 4877 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 4810 4878 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 4811 4879 EXT4_INODE_SIZE(inode->i_sb)) { 4812 - brelse(bh); 4813 4880 ret = -EIO; 4814 4881 goto bad_inode; 4815 4882 } ··· 4865 4884 4866 4885 ret = 0; 4867 4886 if (ei->i_file_acl && 4868 - ((ei->i_file_acl < 4869 - (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + 4870 - EXT4_SB(sb)->s_gdb_count)) || 4871 - (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) { 4887 + !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { 4872 4888 ext4_error(sb, __func__, 4873 4889 "bad extended attribute block %llu in inode #%lu", 4874 4890 ei->i_file_acl, inode->i_ino); ··· 4883 4905 /* Validate block references which are part of inode */ 4884 4906 ret = ext4_check_inode_blockref(inode); 4885 4907 } 4886 - if (ret) { 4887 - brelse(bh); 4908 + if (ret) 4888 4909 goto bad_inode; 4889 - } 4890 4910 4891 4911 if (S_ISREG(inode->i_mode)) { 4892 4912 inode->i_op = &ext4_file_inode_operations; ··· 4912 4936 init_special_inode(inode, inode->i_mode, 4913 4937 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4914 4938 } 
else { 4915 - brelse(bh); 4916 4939 ret = -EIO; 4917 4940 ext4_error(inode->i_sb, __func__, 4918 4941 "bogus i_mode (%o) for inode=%lu", ··· 4924 4949 return inode; 4925 4950 4926 4951 bad_inode: 4952 + brelse(iloc.bh); 4927 4953 iget_failed(inode); 4928 4954 return ERR_PTR(ret); 4929 4955 } ··· 5084 5108 err = rc; 5085 5109 ei->i_state &= ~EXT4_STATE_NEW; 5086 5110 5111 + ext4_update_inode_fsync_trans(handle, inode, 0); 5087 5112 out_brelse: 5088 5113 brelse(bh); 5089 5114 ext4_std_error(inode->i_sb, err); ··· 5204 5227 5205 5228 /* (user+group)*(old+new) structure, inode write (sb, 5206 5229 * inode block, ? - but truncate inode update has it) */ 5207 - handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ 5208 - EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); 5230 + handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ 5231 + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); 5209 5232 if (IS_ERR(handle)) { 5210 5233 error = PTR_ERR(handle); 5211 5234 goto err_out;
+18 -11
fs/ext4/ioctl.c
··· 221 221 struct file *donor_filp; 222 222 int err; 223 223 224 + if (!(filp->f_mode & FMODE_READ) || 225 + !(filp->f_mode & FMODE_WRITE)) 226 + return -EBADF; 227 + 224 228 if (copy_from_user(&me, 225 229 (struct move_extent __user *)arg, sizeof(me))) 226 230 return -EFAULT; 231 + me.moved_len = 0; 227 232 228 233 donor_filp = fget(me.donor_fd); 229 234 if (!donor_filp) 230 235 return -EBADF; 231 236 232 - if (!capable(CAP_DAC_OVERRIDE)) { 233 - if ((current->real_cred->fsuid != inode->i_uid) || 234 - !(inode->i_mode & S_IRUSR) || 235 - !(donor_filp->f_dentry->d_inode->i_mode & 236 - S_IRUSR)) { 237 - fput(donor_filp); 238 - return -EACCES; 239 - } 237 + if (!(donor_filp->f_mode & FMODE_WRITE)) { 238 + err = -EBADF; 239 + goto mext_out; 240 240 } 241 + 242 + err = mnt_want_write(filp->f_path.mnt); 243 + if (err) 244 + goto mext_out; 241 245 242 246 err = ext4_move_extents(filp, donor_filp, me.orig_start, 243 247 me.donor_start, me.len, &me.moved_len); 244 - fput(donor_filp); 248 + mnt_drop_write(filp->f_path.mnt); 249 + if (me.moved_len > 0) 250 + file_remove_suid(donor_filp); 245 251 246 252 if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) 247 - return -EFAULT; 248 - 253 + err = -EFAULT; 254 + mext_out: 255 + fput(donor_filp); 249 256 return err; 250 257 } 251 258
+80 -19
fs/ext4/mballoc.c
··· 2529 2529 struct ext4_group_info *db; 2530 2530 int err, count = 0, count2 = 0; 2531 2531 struct ext4_free_data *entry; 2532 - ext4_fsblk_t discard_block; 2533 2532 struct list_head *l, *ltmp; 2534 2533 2535 2534 list_for_each_safe(l, ltmp, &txn->t_private_list) { ··· 2558 2559 page_cache_release(e4b.bd_bitmap_page); 2559 2560 } 2560 2561 ext4_unlock_group(sb, entry->group); 2561 - discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) 2562 - + entry->start_blk 2563 - + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 2564 - trace_ext4_discard_blocks(sb, (unsigned long long)discard_block, 2565 - entry->count); 2566 - sb_issue_discard(sb, discard_block, entry->count); 2562 + if (test_opt(sb, DISCARD)) { 2563 + ext4_fsblk_t discard_block; 2564 + struct ext4_super_block *es = EXT4_SB(sb)->s_es; 2567 2565 2566 + discard_block = (ext4_fsblk_t)entry->group * 2567 + EXT4_BLOCKS_PER_GROUP(sb) 2568 + + entry->start_blk 2569 + + le32_to_cpu(es->s_first_data_block); 2570 + trace_ext4_discard_blocks(sb, 2571 + (unsigned long long)discard_block, 2572 + entry->count); 2573 + sb_issue_discard(sb, discard_block, entry->count); 2574 + } 2568 2575 kmem_cache_free(ext4_free_ext_cachep, entry); 2569 2576 ext4_mb_release_desc(&e4b); 2570 2577 } ··· 3008 3003 trace_ext4_mballoc_alloc(ac); 3009 3004 else 3010 3005 trace_ext4_mballoc_prealloc(ac); 3006 + } 3007 + 3008 + /* 3009 + * Called on failure; free up any blocks from the inode PA for this 3010 + * context. We don't need this for MB_GROUP_PA because we only change 3011 + * pa_free in ext4_mb_release_context(), but on failure, we've already 3012 + * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. 
3013 + */ 3014 + static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) 3015 + { 3016 + struct ext4_prealloc_space *pa = ac->ac_pa; 3017 + int len; 3018 + 3019 + if (pa && pa->pa_type == MB_INODE_PA) { 3020 + len = ac->ac_b_ex.fe_len; 3021 + pa->pa_free += len; 3022 + } 3023 + 3011 3024 } 3012 3025 3013 3026 /* ··· 4313 4290 ac->ac_status = AC_STATUS_CONTINUE; 4314 4291 goto repeat; 4315 4292 } else if (*errp) { 4293 + ext4_discard_allocated_blocks(ac); 4316 4294 ac->ac_b_ex.fe_len = 0; 4317 4295 ar->len = 0; 4318 4296 ext4_mb_show_ac(ac); ··· 4446 4422 return 0; 4447 4423 } 4448 4424 4449 - /* 4450 - * Main entry point into mballoc to free blocks 4425 + /** 4426 + * ext4_free_blocks() -- Free given blocks and update quota 4427 + * @handle: handle for this transaction 4428 + * @inode: inode 4429 + * @block: start physical block to free 4430 + * @count: number of blocks to count 4431 + * @metadata: Are these metadata blocks 4451 4432 */ 4452 - void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, 4453 - ext4_fsblk_t block, unsigned long count, 4454 - int metadata, unsigned long *freed) 4433 + void ext4_free_blocks(handle_t *handle, struct inode *inode, 4434 + struct buffer_head *bh, ext4_fsblk_t block, 4435 + unsigned long count, int flags) 4455 4436 { 4456 4437 struct buffer_head *bitmap_bh = NULL; 4457 4438 struct super_block *sb = inode->i_sb; 4458 4439 struct ext4_allocation_context *ac = NULL; 4459 4440 struct ext4_group_desc *gdp; 4460 4441 struct ext4_super_block *es; 4442 + unsigned long freed = 0; 4461 4443 unsigned int overflow; 4462 4444 ext4_grpblk_t bit; 4463 4445 struct buffer_head *gd_bh; ··· 4473 4443 int err = 0; 4474 4444 int ret; 4475 4445 4476 - *freed = 0; 4446 + if (bh) { 4447 + if (block) 4448 + BUG_ON(block != bh->b_blocknr); 4449 + else 4450 + block = bh->b_blocknr; 4451 + } 4477 4452 4478 4453 sbi = EXT4_SB(sb); 4479 4454 es = EXT4_SB(sb)->s_es; 4480 - if (block < le32_to_cpu(es->s_first_data_block) || 4481 
- block + count < block || 4482 - block + count > ext4_blocks_count(es)) { 4455 + if (!ext4_data_block_valid(sbi, block, count)) { 4483 4456 ext4_error(sb, __func__, 4484 4457 "Freeing blocks not in datazone - " 4485 4458 "block = %llu, count = %lu", block, count); ··· 4490 4457 } 4491 4458 4492 4459 ext4_debug("freeing block %llu\n", block); 4493 - trace_ext4_free_blocks(inode, block, count, metadata); 4460 + trace_ext4_free_blocks(inode, block, count, flags); 4461 + 4462 + if (flags & EXT4_FREE_BLOCKS_FORGET) { 4463 + struct buffer_head *tbh = bh; 4464 + int i; 4465 + 4466 + BUG_ON(bh && (count > 1)); 4467 + 4468 + for (i = 0; i < count; i++) { 4469 + if (!bh) 4470 + tbh = sb_find_get_block(inode->i_sb, 4471 + block + i); 4472 + ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 4473 + inode, tbh, block + i); 4474 + } 4475 + } 4476 + 4477 + /* 4478 + * We need to make sure we don't reuse the freed block until 4479 + * after the transaction is committed, which we can do by 4480 + * treating the block as metadata, below. We make an 4481 + * exception if the inode is to be written in writeback mode 4482 + * since writeback mode has weak data consistency guarantees. 4483 + */ 4484 + if (!ext4_should_writeback_data(inode)) 4485 + flags |= EXT4_FREE_BLOCKS_METADATA; 4494 4486 4495 4487 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4496 4488 if (ac) { ··· 4591 4533 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4592 4534 if (err) 4593 4535 goto error_return; 4594 - if (metadata && ext4_handle_valid(handle)) { 4536 + 4537 + if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) { 4595 4538 struct ext4_free_data *new_entry; 4596 4539 /* 4597 4540 * blocks being freed are metadata. 
these blocks shouldn't ··· 4631 4572 4632 4573 ext4_mb_release_desc(&e4b); 4633 4574 4634 - *freed += count; 4575 + freed += count; 4635 4576 4636 4577 /* We dirtied the bitmap block */ 4637 4578 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); ··· 4651 4592 } 4652 4593 sb->s_dirt = 1; 4653 4594 error_return: 4595 + if (freed) 4596 + vfs_dq_free_block(inode, freed); 4654 4597 brelse(bitmap_bh); 4655 4598 ext4_std_error(sb, err); 4656 4599 if (ac)
+18 -9
fs/ext4/migrate.c
··· 238 238 * So allocate a credit of 3. We may update 239 239 * quota (user and group). 240 240 */ 241 - needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); 241 + needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); 242 242 243 243 if (ext4_journal_extend(handle, needed) != 0) 244 244 retval = ext4_journal_restart(handle, needed); ··· 262 262 for (i = 0; i < max_entries; i++) { 263 263 if (tmp_idata[i]) { 264 264 extend_credit_for_blkdel(handle, inode); 265 - ext4_free_blocks(handle, inode, 266 - le32_to_cpu(tmp_idata[i]), 1, 1); 265 + ext4_free_blocks(handle, inode, 0, 266 + le32_to_cpu(tmp_idata[i]), 1, 267 + EXT4_FREE_BLOCKS_METADATA | 268 + EXT4_FREE_BLOCKS_FORGET); 267 269 } 268 270 } 269 271 put_bh(bh); 270 272 extend_credit_for_blkdel(handle, inode); 271 - ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); 273 + ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, 274 + EXT4_FREE_BLOCKS_METADATA | 275 + EXT4_FREE_BLOCKS_FORGET); 272 276 return 0; 273 277 } 274 278 ··· 301 297 } 302 298 put_bh(bh); 303 299 extend_credit_for_blkdel(handle, inode); 304 - ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); 300 + ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, 301 + EXT4_FREE_BLOCKS_METADATA | 302 + EXT4_FREE_BLOCKS_FORGET); 305 303 return 0; 306 304 } 307 305 ··· 314 308 /* ei->i_data[EXT4_IND_BLOCK] */ 315 309 if (i_data[0]) { 316 310 extend_credit_for_blkdel(handle, inode); 317 - ext4_free_blocks(handle, inode, 318 - le32_to_cpu(i_data[0]), 1, 1); 311 + ext4_free_blocks(handle, inode, 0, 312 + le32_to_cpu(i_data[0]), 1, 313 + EXT4_FREE_BLOCKS_METADATA | 314 + EXT4_FREE_BLOCKS_FORGET); 319 315 } 320 316 321 317 /* ei->i_data[EXT4_DIND_BLOCK] */ ··· 427 419 } 428 420 put_bh(bh); 429 421 extend_credit_for_blkdel(handle, inode); 430 - ext4_free_blocks(handle, inode, block, 1, 1); 422 + ext4_free_blocks(handle, inode, 0, block, 1, 423 + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 431 424 return retval; 432 425 } 433 
426 ··· 486 477 handle = ext4_journal_start(inode, 487 478 EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 488 479 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 489 - 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb) 480 + EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) 490 481 + 1); 491 482 if (IS_ERR(handle)) { 492 483 retval = PTR_ERR(handle);
+133 -149
fs/ext4/move_extent.c
··· 77 77 mext_next_extent(struct inode *inode, struct ext4_ext_path *path, 78 78 struct ext4_extent **extent) 79 79 { 80 + struct ext4_extent_header *eh; 80 81 int ppos, leaf_ppos = path->p_depth; 81 82 82 83 ppos = leaf_ppos; 83 84 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { 84 85 /* leaf block */ 85 86 *extent = ++path[ppos].p_ext; 87 + path[ppos].p_block = ext_pblock(path[ppos].p_ext); 86 88 return 0; 87 89 } 88 90 ··· 121 119 ext_block_hdr(path[cur_ppos+1].p_bh); 122 120 } 123 121 122 + path[leaf_ppos].p_ext = *extent = NULL; 123 + 124 + eh = path[leaf_ppos].p_hdr; 125 + if (le16_to_cpu(eh->eh_entries) == 0) 126 + /* empty leaf is found */ 127 + return -ENODATA; 128 + 124 129 /* leaf block */ 125 130 path[leaf_ppos].p_ext = *extent = 126 131 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); 132 + path[leaf_ppos].p_block = 133 + ext_pblock(path[leaf_ppos].p_ext); 127 134 return 0; 128 135 } 129 136 } ··· 166 155 } 167 156 168 157 /** 169 - * mext_double_down_read - Acquire two inodes' read semaphore 158 + * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem 170 159 * 171 160 * @orig_inode: original inode structure 172 161 * @donor_inode: donor inode structure 173 - * Acquire read semaphore of the two inodes (orig and donor) by i_ino order. 162 + * Acquire write lock of i_data_sem of the two inodes (orig and donor) by 163 + * i_ino order. 174 164 */ 175 165 static void 176 - mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode) 177 - { 178 - struct inode *first = orig_inode, *second = donor_inode; 179 - 180 - /* 181 - * Use the inode number to provide the stable locking order instead 182 - * of its address, because the C language doesn't guarantee you can 183 - * compare pointers that don't come from the same array. 
184 - */ 185 - if (donor_inode->i_ino < orig_inode->i_ino) { 186 - first = donor_inode; 187 - second = orig_inode; 188 - } 189 - 190 - down_read(&EXT4_I(first)->i_data_sem); 191 - down_read(&EXT4_I(second)->i_data_sem); 192 - } 193 - 194 - /** 195 - * mext_double_down_write - Acquire two inodes' write semaphore 196 - * 197 - * @orig_inode: original inode structure 198 - * @donor_inode: donor inode structure 199 - * Acquire write semaphore of the two inodes (orig and donor) by i_ino order. 200 - */ 201 - static void 202 - mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) 166 + double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) 203 167 { 204 168 struct inode *first = orig_inode, *second = donor_inode; 205 169 ··· 189 203 } 190 204 191 205 down_write(&EXT4_I(first)->i_data_sem); 192 - down_write(&EXT4_I(second)->i_data_sem); 206 + down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); 193 207 } 194 208 195 209 /** 196 - * mext_double_up_read - Release two inodes' read semaphore 210 + * double_up_write_data_sem - Release two inodes' write lock of i_data_sem 197 211 * 198 212 * @orig_inode: original inode structure to be released its lock first 199 213 * @donor_inode: donor inode structure to be released its lock second 200 - * Release read semaphore of two inodes (orig and donor). 214 + * Release write lock of i_data_sem of two inodes (orig and donor). 201 215 */ 202 216 static void 203 - mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) 204 - { 205 - up_read(&EXT4_I(orig_inode)->i_data_sem); 206 - up_read(&EXT4_I(donor_inode)->i_data_sem); 207 - } 208 - 209 - /** 210 - * mext_double_up_write - Release two inodes' write semaphore 211 - * 212 - * @orig_inode: original inode structure to be released its lock first 213 - * @donor_inode: donor inode structure to be released its lock second 214 - * Release write semaphore of two inodes (orig and donor). 
215 - */ 216 - static void 217 - mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode) 217 + double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) 218 218 { 219 219 up_write(&EXT4_I(orig_inode)->i_data_sem); 220 220 up_write(&EXT4_I(donor_inode)->i_data_sem); ··· 568 596 * @tmp_oext: the extent that will belong to the donor inode 569 597 * @orig_off: block offset of original inode 570 598 * @donor_off: block offset of donor inode 571 - * @max_count: the maximun length of extents 599 + * @max_count: the maximum length of extents 572 600 * 573 601 * Return 0 on success, or a negative error value on failure. 574 602 */ ··· 633 661 * @donor_inode: donor inode 634 662 * @from: block offset of orig_inode 635 663 * @count: block count to be replaced 664 + * @err: pointer to save return value 636 665 * 637 666 * Replace original inode extents and donor inode extents page by page. 638 667 * We implement this replacement in the following three steps: ··· 644 671 * 3. Change the block information of donor inode to point at the saved 645 672 * original inode blocks in the dummy extents. 646 673 * 647 - * Return 0 on success, or a negative error value on failure. 674 + * Return replaced block count. 
648 675 */ 649 676 static int 650 677 mext_replace_branches(handle_t *handle, struct inode *orig_inode, 651 678 struct inode *donor_inode, ext4_lblk_t from, 652 - ext4_lblk_t count) 679 + ext4_lblk_t count, int *err) 653 680 { 654 681 struct ext4_ext_path *orig_path = NULL; 655 682 struct ext4_ext_path *donor_path = NULL; 656 683 struct ext4_extent *oext, *dext; 657 684 struct ext4_extent tmp_dext, tmp_oext; 658 685 ext4_lblk_t orig_off = from, donor_off = from; 659 - int err = 0; 660 686 int depth; 661 687 int replaced_count = 0; 662 688 int dext_alen; 663 689 664 - mext_double_down_write(orig_inode, donor_inode); 690 + /* Protect extent trees against block allocations via delalloc */ 691 + double_down_write_data_sem(orig_inode, donor_inode); 665 692 666 693 /* Get the original extent for the block "orig_off" */ 667 - err = get_ext_path(orig_inode, orig_off, &orig_path); 668 - if (err) 694 + *err = get_ext_path(orig_inode, orig_off, &orig_path); 695 + if (*err) 669 696 goto out; 670 697 671 698 /* Get the donor extent for the head */ 672 - err = get_ext_path(donor_inode, donor_off, &donor_path); 673 - if (err) 699 + *err = get_ext_path(donor_inode, donor_off, &donor_path); 700 + if (*err) 674 701 goto out; 675 702 depth = ext_depth(orig_inode); 676 703 oext = orig_path[depth].p_ext; ··· 680 707 dext = donor_path[depth].p_ext; 681 708 tmp_dext = *dext; 682 709 683 - err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, 710 + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, 684 711 donor_off, count); 685 - if (err) 712 + if (*err) 686 713 goto out; 687 714 688 715 /* Loop for the donor extents */ ··· 691 718 if (!dext) { 692 719 ext4_error(donor_inode->i_sb, __func__, 693 720 "The extent for donor must be found"); 694 - err = -EIO; 721 + *err = -EIO; 695 722 goto out; 696 723 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { 697 724 ext4_error(donor_inode->i_sb, __func__, ··· 699 726 "extent(%u) should be equal", 700 727 donor_off, 701 
728 le32_to_cpu(tmp_dext.ee_block)); 702 - err = -EIO; 729 + *err = -EIO; 703 730 goto out; 704 731 } 705 732 706 733 /* Set donor extent to orig extent */ 707 - err = mext_leaf_block(handle, orig_inode, 734 + *err = mext_leaf_block(handle, orig_inode, 708 735 orig_path, &tmp_dext, &orig_off); 709 - if (err < 0) 736 + if (*err) 710 737 goto out; 711 738 712 739 /* Set orig extent to donor extent */ 713 - err = mext_leaf_block(handle, donor_inode, 740 + *err = mext_leaf_block(handle, donor_inode, 714 741 donor_path, &tmp_oext, &donor_off); 715 - if (err < 0) 742 + if (*err) 716 743 goto out; 717 744 718 745 dext_alen = ext4_ext_get_actual_len(&tmp_dext); ··· 726 753 727 754 if (orig_path) 728 755 ext4_ext_drop_refs(orig_path); 729 - err = get_ext_path(orig_inode, orig_off, &orig_path); 730 - if (err) 756 + *err = get_ext_path(orig_inode, orig_off, &orig_path); 757 + if (*err) 731 758 goto out; 732 759 depth = ext_depth(orig_inode); 733 760 oext = orig_path[depth].p_ext; 734 - if (le32_to_cpu(oext->ee_block) + 735 - ext4_ext_get_actual_len(oext) <= orig_off) { 736 - err = 0; 737 - goto out; 738 - } 739 761 tmp_oext = *oext; 740 762 741 763 if (donor_path) 742 764 ext4_ext_drop_refs(donor_path); 743 - err = get_ext_path(donor_inode, donor_off, &donor_path); 744 - if (err) 765 + *err = get_ext_path(donor_inode, donor_off, &donor_path); 766 + if (*err) 745 767 goto out; 746 768 depth = ext_depth(donor_inode); 747 769 dext = donor_path[depth].p_ext; 748 - if (le32_to_cpu(dext->ee_block) + 749 - ext4_ext_get_actual_len(dext) <= donor_off) { 750 - err = 0; 751 - goto out; 752 - } 753 770 tmp_dext = *dext; 754 771 755 - err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, 772 + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, 756 773 donor_off, count - replaced_count); 757 - if (err) 774 + if (*err) 758 775 goto out; 759 776 } 760 777 ··· 758 795 kfree(donor_path); 759 796 } 760 797 761 - mext_double_up_write(orig_inode, donor_inode); 762 - return err; 
798 + ext4_ext_invalidate_cache(orig_inode); 799 + ext4_ext_invalidate_cache(donor_inode); 800 + 801 + double_up_write_data_sem(orig_inode, donor_inode); 802 + 803 + return replaced_count; 763 804 } 764 805 765 806 /** ··· 775 808 * @data_offset_in_page: block index where data swapping starts 776 809 * @block_len_in_page: the number of blocks to be swapped 777 810 * @uninit: orig extent is uninitialized or not 811 + * @err: pointer to save return value 778 812 * 779 813 * Save the data in original inode blocks and replace original inode extents 780 814 * with donor inode extents by calling mext_replace_branches(). 781 - * Finally, write out the saved data in new original inode blocks. Return 0 782 - * on success, or a negative error value on failure. 815 + * Finally, write out the saved data in new original inode blocks. Return 816 + * replaced block count. 783 817 */ 784 818 static int 785 819 move_extent_per_page(struct file *o_filp, struct inode *donor_inode, 786 820 pgoff_t orig_page_offset, int data_offset_in_page, 787 - int block_len_in_page, int uninit) 821 + int block_len_in_page, int uninit, int *err) 788 822 { 789 823 struct inode *orig_inode = o_filp->f_dentry->d_inode; 790 824 struct address_space *mapping = orig_inode->i_mapping; ··· 797 829 long long offs = orig_page_offset << PAGE_CACHE_SHIFT; 798 830 unsigned long blocksize = orig_inode->i_sb->s_blocksize; 799 831 unsigned int w_flags = 0; 800 - unsigned int tmp_data_len, data_len; 832 + unsigned int tmp_data_size, data_size, replaced_size; 801 833 void *fsdata; 802 - int ret, i, jblocks; 834 + int i, jblocks; 835 + int err2 = 0; 836 + int replaced_count = 0; 803 837 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; 804 838 805 839 /* ··· 811 841 jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; 812 842 handle = ext4_journal_start(orig_inode, jblocks); 813 843 if (IS_ERR(handle)) { 814 - ret = PTR_ERR(handle); 815 - return ret; 844 + *err = PTR_ERR(handle); 845 + return 0; 816 846 
} 817 847 818 848 if (segment_eq(get_fs(), KERNEL_DS)) ··· 828 858 * Just swap data blocks between orig and donor. 829 859 */ 830 860 if (uninit) { 831 - ret = mext_replace_branches(handle, orig_inode, 832 - donor_inode, orig_blk_offset, 833 - block_len_in_page); 834 - 835 - /* Clear the inode cache not to refer to the old data */ 836 - ext4_ext_invalidate_cache(orig_inode); 837 - ext4_ext_invalidate_cache(donor_inode); 861 + replaced_count = mext_replace_branches(handle, orig_inode, 862 + donor_inode, orig_blk_offset, 863 + block_len_in_page, err); 838 864 goto out2; 839 865 } 840 866 841 867 offs = (long long)orig_blk_offset << orig_inode->i_blkbits; 842 868 843 - /* Calculate data_len */ 869 + /* Calculate data_size */ 844 870 if ((orig_blk_offset + block_len_in_page - 1) == 845 871 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { 846 872 /* Replace the last block */ 847 - tmp_data_len = orig_inode->i_size & (blocksize - 1); 873 + tmp_data_size = orig_inode->i_size & (blocksize - 1); 848 874 /* 849 - * If data_len equal zero, it shows data_len is multiples of 875 + * If data_size equal zero, it shows data_size is multiples of 850 876 * blocksize. So we set appropriate value. 
851 877 */ 852 - if (tmp_data_len == 0) 853 - tmp_data_len = blocksize; 878 + if (tmp_data_size == 0) 879 + tmp_data_size = blocksize; 854 880 855 - data_len = tmp_data_len + 881 + data_size = tmp_data_size + 856 882 ((block_len_in_page - 1) << orig_inode->i_blkbits); 857 - } else { 858 - data_len = block_len_in_page << orig_inode->i_blkbits; 859 - } 883 + } else 884 + data_size = block_len_in_page << orig_inode->i_blkbits; 860 885 861 - ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags, 886 + replaced_size = data_size; 887 + 888 + *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags, 862 889 &page, &fsdata); 863 - if (unlikely(ret < 0)) 890 + if (unlikely(*err < 0)) 864 891 goto out; 865 892 866 893 if (!PageUptodate(page)) { ··· 878 911 /* Release old bh and drop refs */ 879 912 try_to_release_page(page, 0); 880 913 881 - ret = mext_replace_branches(handle, orig_inode, donor_inode, 882 - orig_blk_offset, block_len_in_page); 883 - if (ret < 0) 884 - goto out; 885 - 886 - /* Clear the inode cache not to refer to the old data */ 887 - ext4_ext_invalidate_cache(orig_inode); 888 - ext4_ext_invalidate_cache(donor_inode); 914 + replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, 915 + orig_blk_offset, block_len_in_page, 916 + &err2); 917 + if (err2) { 918 + if (replaced_count) { 919 + block_len_in_page = replaced_count; 920 + replaced_size = 921 + block_len_in_page << orig_inode->i_blkbits; 922 + } else 923 + goto out; 924 + } 889 925 890 926 if (!page_has_buffers(page)) 891 927 create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); ··· 898 928 bh = bh->b_this_page; 899 929 900 930 for (i = 0; i < block_len_in_page; i++) { 901 - ret = ext4_get_block(orig_inode, 931 + *err = ext4_get_block(orig_inode, 902 932 (sector_t)(orig_blk_offset + i), bh, 0); 903 - if (ret < 0) 933 + if (*err < 0) 904 934 goto out; 905 935 906 936 if (bh->b_this_page != NULL) 907 937 bh = bh->b_this_page; 908 938 } 909 939 910 - ret = 
a_ops->write_end(o_filp, mapping, offs, data_len, data_len, 940 + *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size, 911 941 page, fsdata); 912 942 page = NULL; 913 943 ··· 921 951 out2: 922 952 ext4_journal_stop(handle); 923 953 924 - return ret < 0 ? ret : 0; 954 + if (err2) 955 + *err = err2; 956 + 957 + return replaced_count; 925 958 } 926 959 927 960 /** ··· 935 962 * @orig_start: logical start offset in block for orig 936 963 * @donor_start: logical start offset in block for donor 937 964 * @len: the number of blocks to be moved 938 - * @moved_len: moved block length 939 965 * 940 966 * Check the arguments of ext4_move_extents() whether the files can be 941 967 * exchanged with each other. ··· 942 970 */ 943 971 static int 944 972 mext_check_arguments(struct inode *orig_inode, 945 - struct inode *donor_inode, __u64 orig_start, 946 - __u64 donor_start, __u64 *len, __u64 moved_len) 973 + struct inode *donor_inode, __u64 orig_start, 974 + __u64 donor_start, __u64 *len) 947 975 { 948 976 ext4_lblk_t orig_blocks, donor_blocks; 949 977 unsigned int blkbits = orig_inode->i_blkbits; ··· 954 982 ext4_debug("ext4 move extent: The argument files should be " 955 983 "regular file [ino:orig %lu, donor %lu]\n", 956 984 orig_inode->i_ino, donor_inode->i_ino); 985 + return -EINVAL; 986 + } 987 + 988 + if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { 989 + ext4_debug("ext4 move extent: suid or sgid is set" 990 + " to donor file [ino:orig %lu, donor %lu]\n", 991 + orig_inode->i_ino, donor_inode->i_ino); 957 992 return -EINVAL; 958 993 } 959 994 ··· 1001 1022 ext4_debug("ext4 move extent: orig and donor's start " 1002 1023 "offset are not same [ino:orig %lu, donor %lu]\n", 1003 1024 orig_inode->i_ino, donor_inode->i_ino); 1004 - return -EINVAL; 1005 - } 1006 - 1007 - if (moved_len) { 1008 - ext4_debug("ext4 move extent: moved_len should be 0 " 1009 - "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, 1010 - donor_inode->i_ino); 1011 1025 return -EINVAL; 1012 
1026 } 1013 1027 ··· 1060 1088 } 1061 1089 1062 1090 if (!*len) { 1063 - ext4_debug("ext4 move extent: len shoudld not be 0 " 1091 + ext4_debug("ext4 move extent: len should not be 0 " 1064 1092 "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, 1065 1093 donor_inode->i_ino); 1066 1094 return -EINVAL; ··· 1204 1232 return -EINVAL; 1205 1233 } 1206 1234 1207 - /* protect orig and donor against a truncate */ 1235 + /* Protect orig and donor inodes against a truncate */ 1208 1236 ret1 = mext_inode_double_lock(orig_inode, donor_inode); 1209 1237 if (ret1 < 0) 1210 1238 return ret1; 1211 1239 1212 - mext_double_down_read(orig_inode, donor_inode); 1240 + /* Protect extent tree against block allocations via delalloc */ 1241 + double_down_write_data_sem(orig_inode, donor_inode); 1213 1242 /* Check the filesystem environment whether move_extent can be done */ 1214 1243 ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, 1215 - donor_start, &len, *moved_len); 1216 - mext_double_up_read(orig_inode, donor_inode); 1244 + donor_start, &len); 1217 1245 if (ret1) 1218 1246 goto out; 1219 1247 ··· 1327 1355 seq_start = le32_to_cpu(ext_cur->ee_block); 1328 1356 rest_blocks = seq_blocks; 1329 1357 1330 - /* Discard preallocations of two inodes */ 1331 - down_write(&EXT4_I(orig_inode)->i_data_sem); 1332 - ext4_discard_preallocations(orig_inode); 1333 - up_write(&EXT4_I(orig_inode)->i_data_sem); 1334 - 1335 - down_write(&EXT4_I(donor_inode)->i_data_sem); 1336 - ext4_discard_preallocations(donor_inode); 1337 - up_write(&EXT4_I(donor_inode)->i_data_sem); 1358 + /* 1359 + * Up semaphore to avoid following problems: 1360 + * a. transaction deadlock among ext4_journal_start, 1361 + * ->write_begin via pagefault, and jbd2_journal_commit 1362 + * b. 
racing with ->readpage, ->write_begin, and ext4_get_block 1363 + * in move_extent_per_page 1364 + */ 1365 + double_up_write_data_sem(orig_inode, donor_inode); 1338 1366 1339 1367 while (orig_page_offset <= seq_end_page) { 1340 1368 1341 1369 /* Swap original branches with new branches */ 1342 - ret1 = move_extent_per_page(o_filp, donor_inode, 1370 + block_len_in_page = move_extent_per_page( 1371 + o_filp, donor_inode, 1343 1372 orig_page_offset, 1344 1373 data_offset_in_page, 1345 - block_len_in_page, uninit); 1346 - if (ret1 < 0) 1347 - goto out; 1348 - orig_page_offset++; 1374 + block_len_in_page, uninit, 1375 + &ret1); 1376 + 1349 1377 /* Count how many blocks we have exchanged */ 1350 1378 *moved_len += block_len_in_page; 1379 + if (ret1 < 0) 1380 + break; 1351 1381 if (*moved_len > len) { 1352 1382 ext4_error(orig_inode->i_sb, __func__, 1353 1383 "We replaced blocks too much! " 1354 1384 "sum of replaced: %llu requested: %llu", 1355 1385 *moved_len, len); 1356 1386 ret1 = -EIO; 1357 - goto out; 1387 + break; 1358 1388 } 1359 1389 1390 + orig_page_offset++; 1360 1391 data_offset_in_page = 0; 1361 1392 rest_blocks -= block_len_in_page; 1362 1393 if (rest_blocks > blocks_per_page) ··· 1367 1392 else 1368 1393 block_len_in_page = rest_blocks; 1369 1394 } 1395 + 1396 + double_down_write_data_sem(orig_inode, donor_inode); 1397 + if (ret1 < 0) 1398 + break; 1370 1399 1371 1400 /* Decrease buffer counter */ 1372 1401 if (holecheck_path) ··· 1393 1414 1394 1415 } 1395 1416 out: 1417 + if (*moved_len) { 1418 + ext4_discard_preallocations(orig_inode); 1419 + ext4_discard_preallocations(donor_inode); 1420 + } 1421 + 1396 1422 if (orig_path) { 1397 1423 ext4_ext_drop_refs(orig_path); 1398 1424 kfree(orig_path); ··· 1406 1422 ext4_ext_drop_refs(holecheck_path); 1407 1423 kfree(holecheck_path); 1408 1424 } 1409 - 1425 + double_up_write_data_sem(orig_inode, donor_inode); 1410 1426 ret2 = mext_inode_double_unlock(orig_inode, donor_inode); 1411 1427 1412 1428 if (ret1)
+16 -22
fs/ext4/namei.c
··· 1292 1292 * add_dirent_to_buf will attempt search the directory block for 1293 1293 * space. It will return -ENOSPC if no space is available, and -EIO 1294 1294 * and -EEXIST if directory entry already exists. 1295 - * 1296 - * NOTE! bh is NOT released in the case where ENOSPC is returned. In 1297 - * all other cases bh is released. 1298 1295 */ 1299 1296 static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, 1300 1297 struct inode *inode, struct ext4_dir_entry_2 *de, ··· 1312 1315 top = bh->b_data + blocksize - reclen; 1313 1316 while ((char *) de <= top) { 1314 1317 if (!ext4_check_dir_entry("ext4_add_entry", dir, de, 1315 - bh, offset)) { 1316 - brelse(bh); 1318 + bh, offset)) 1317 1319 return -EIO; 1318 - } 1319 - if (ext4_match(namelen, name, de)) { 1320 - brelse(bh); 1320 + if (ext4_match(namelen, name, de)) 1321 1321 return -EEXIST; 1322 - } 1323 1322 nlen = EXT4_DIR_REC_LEN(de->name_len); 1324 1323 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); 1325 1324 if ((de->inode? 
rlen - nlen: rlen) >= reclen) ··· 1330 1337 err = ext4_journal_get_write_access(handle, bh); 1331 1338 if (err) { 1332 1339 ext4_std_error(dir->i_sb, err); 1333 - brelse(bh); 1334 1340 return err; 1335 1341 } 1336 1342 ··· 1369 1377 err = ext4_handle_dirty_metadata(handle, dir, bh); 1370 1378 if (err) 1371 1379 ext4_std_error(dir->i_sb, err); 1372 - brelse(bh); 1373 1380 return 0; 1374 1381 } 1375 1382 ··· 1462 1471 if (!(de)) 1463 1472 return retval; 1464 1473 1465 - return add_dirent_to_buf(handle, dentry, inode, de, bh); 1474 + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 1475 + brelse(bh); 1476 + return retval; 1466 1477 } 1467 1478 1468 1479 /* ··· 1507 1514 if(!bh) 1508 1515 return retval; 1509 1516 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); 1510 - if (retval != -ENOSPC) 1517 + if (retval != -ENOSPC) { 1518 + brelse(bh); 1511 1519 return retval; 1520 + } 1512 1521 1513 1522 if (blocks == 1 && !dx_fallback && 1514 1523 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) ··· 1523 1528 de = (struct ext4_dir_entry_2 *) bh->b_data; 1524 1529 de->inode = 0; 1525 1530 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); 1526 - return add_dirent_to_buf(handle, dentry, inode, de, bh); 1531 + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 1532 + brelse(bh); 1533 + return retval; 1527 1534 } 1528 1535 1529 1536 /* ··· 1558 1561 goto journal_error; 1559 1562 1560 1563 err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); 1561 - if (err != -ENOSPC) { 1562 - bh = NULL; 1564 + if (err != -ENOSPC) 1563 1565 goto cleanup; 1564 - } 1565 1566 1566 1567 /* Block full, should compress but for now just split */ 1567 1568 dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", ··· 1652 1657 if (!de) 1653 1658 goto cleanup; 1654 1659 err = add_dirent_to_buf(handle, dentry, inode, de, bh); 1655 - bh = NULL; 1656 1660 goto cleanup; 1657 1661 1658 1662 journal_error: ··· 1769 1775 retry: 1770 1776 handle = 
ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 1771 1777 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1772 - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); 1778 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); 1773 1779 if (IS_ERR(handle)) 1774 1780 return PTR_ERR(handle); 1775 1781 ··· 1803 1809 retry: 1804 1810 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 1805 1811 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1806 - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); 1812 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); 1807 1813 if (IS_ERR(handle)) 1808 1814 return PTR_ERR(handle); 1809 1815 ··· 1840 1846 retry: 1841 1847 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 1842 1848 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1843 - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); 1849 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); 1844 1850 if (IS_ERR(handle)) 1845 1851 return PTR_ERR(handle); 1846 1852 ··· 2253 2259 retry: 2254 2260 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2255 2261 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + 2256 - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); 2262 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); 2257 2263 if (IS_ERR(handle)) 2258 2264 return PTR_ERR(handle); 2259 2265
+1 -1
fs/ext4/resize.c
··· 247 247 goto exit_bh; 248 248 249 249 if (IS_ERR(gdb = bclean(handle, sb, block))) { 250 - err = PTR_ERR(bh); 250 + err = PTR_ERR(gdb); 251 251 goto exit_bh; 252 252 } 253 253 ext4_handle_dirty_metadata(handle, NULL, gdb);
+87 -31
fs/ext4/super.c
··· 603 603 if (sb->s_dirt) 604 604 ext4_commit_super(sb, 1); 605 605 606 - ext4_release_system_zone(sb); 607 - ext4_mb_release(sb); 608 - ext4_ext_release(sb); 609 - ext4_xattr_put_super(sb); 610 606 if (sbi->s_journal) { 611 607 err = jbd2_journal_destroy(sbi->s_journal); 612 608 sbi->s_journal = NULL; ··· 610 614 ext4_abort(sb, __func__, 611 615 "Couldn't clean up the journal"); 612 616 } 617 + 618 + ext4_release_system_zone(sb); 619 + ext4_mb_release(sb); 620 + ext4_ext_release(sb); 621 + ext4_xattr_put_super(sb); 622 + 613 623 if (!(sb->s_flags & MS_RDONLY)) { 614 624 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 615 625 es->s_state = cpu_to_le16(sbi->s_mount_state); ··· 706 704 spin_lock_init(&(ei->i_block_reservation_lock)); 707 705 INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); 708 706 ei->cur_aio_dio = NULL; 707 + ei->i_sync_tid = 0; 708 + ei->i_datasync_tid = 0; 709 709 710 710 return &ei->vfs_inode; 711 711 } ··· 903 899 if (test_opt(sb, NO_AUTO_DA_ALLOC)) 904 900 seq_puts(seq, ",noauto_da_alloc"); 905 901 902 + if (test_opt(sb, DISCARD)) 903 + seq_puts(seq, ",discard"); 904 + 905 + if (test_opt(sb, NOLOAD)) 906 + seq_puts(seq, ",norecovery"); 907 + 906 908 ext4_show_quota_options(seq, sb); 907 909 908 910 return 0; ··· 1089 1079 Opt_usrquota, Opt_grpquota, Opt_i_version, 1090 1080 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1091 1081 Opt_block_validity, Opt_noblock_validity, 1092 - Opt_inode_readahead_blks, Opt_journal_ioprio 1082 + Opt_inode_readahead_blks, Opt_journal_ioprio, 1083 + Opt_discard, Opt_nodiscard, 1093 1084 }; 1094 1085 1095 1086 static const match_table_t tokens = { ··· 1115 1104 {Opt_acl, "acl"}, 1116 1105 {Opt_noacl, "noacl"}, 1117 1106 {Opt_noload, "noload"}, 1107 + {Opt_noload, "norecovery"}, 1118 1108 {Opt_nobh, "nobh"}, 1119 1109 {Opt_bh, "bh"}, 1120 1110 {Opt_commit, "commit=%u"}, ··· 1156 1144 {Opt_auto_da_alloc, "auto_da_alloc=%u"}, 1157 1145 {Opt_auto_da_alloc, "auto_da_alloc"}, 1158 1146 {Opt_noauto_da_alloc, 
"noauto_da_alloc"}, 1147 + {Opt_discard, "discard"}, 1148 + {Opt_nodiscard, "nodiscard"}, 1159 1149 {Opt_err, NULL}, 1160 1150 }; 1161 1151 ··· 1579 1565 else 1580 1566 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); 1581 1567 break; 1568 + case Opt_discard: 1569 + set_opt(sbi->s_mount_opt, DISCARD); 1570 + break; 1571 + case Opt_nodiscard: 1572 + clear_opt(sbi->s_mount_opt, DISCARD); 1573 + break; 1582 1574 default: 1583 1575 ext4_msg(sb, KERN_ERR, 1584 1576 "Unrecognized mount option \"%s\" " ··· 1693 1673 size_t size; 1694 1674 int i; 1695 1675 1696 - if (!sbi->s_es->s_log_groups_per_flex) { 1676 + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; 1677 + groups_per_flex = 1 << sbi->s_log_groups_per_flex; 1678 + 1679 + if (groups_per_flex < 2) { 1697 1680 sbi->s_log_groups_per_flex = 0; 1698 1681 return 1; 1699 1682 } 1700 - 1701 - sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; 1702 - groups_per_flex = 1 << sbi->s_log_groups_per_flex; 1703 1683 1704 1684 /* We allocate both existing and potentially added groups */ 1705 1685 flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + ··· 2741 2721 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { 2742 2722 if (ext4_load_journal(sb, es, journal_devnum)) 2743 2723 goto failed_mount3; 2744 - if (!(sb->s_flags & MS_RDONLY) && 2745 - EXT4_SB(sb)->s_journal->j_failed_commit) { 2746 - ext4_msg(sb, KERN_CRIT, "error: " 2747 - "ext4_fill_super: Journal transaction " 2748 - "%u is corrupt", 2749 - EXT4_SB(sb)->s_journal->j_failed_commit); 2750 - if (test_opt(sb, ERRORS_RO)) { 2751 - ext4_msg(sb, KERN_CRIT, 2752 - "Mounting filesystem read-only"); 2753 - sb->s_flags |= MS_RDONLY; 2754 - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 2755 - es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 2756 - } 2757 - if (test_opt(sb, ERRORS_PANIC)) { 2758 - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 2759 - es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 2760 - ext4_commit_super(sb, 1); 2761 - goto 
failed_mount4; 2762 - } 2763 - } 2764 2724 } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && 2765 2725 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { 2766 2726 ext4_msg(sb, KERN_ERR, "required journal recovery " ··· 3668 3668 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; 3669 3669 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - 3670 3670 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); 3671 - ext4_free_blocks_count_set(es, buf->f_bfree); 3672 3671 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 3673 3672 if (buf->f_bfree < ext4_r_blocks_count(es)) 3674 3673 buf->f_bavail = 0; 3675 3674 buf->f_files = le32_to_cpu(es->s_inodes_count); 3676 3675 buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); 3677 - es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); 3678 3676 buf->f_namelen = EXT4_NAME_LEN; 3679 3677 fsid = le64_to_cpup((void *)es->s_uuid) ^ 3680 3678 le64_to_cpup((void *)es->s_uuid + sizeof(u64)); ··· 3964 3966 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); 3965 3967 } 3966 3968 3969 + #if !defined(CONTIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 3970 + static struct file_system_type ext2_fs_type = { 3971 + .owner = THIS_MODULE, 3972 + .name = "ext2", 3973 + .get_sb = ext4_get_sb, 3974 + .kill_sb = kill_block_super, 3975 + .fs_flags = FS_REQUIRES_DEV, 3976 + }; 3977 + 3978 + static inline void register_as_ext2(void) 3979 + { 3980 + int err = register_filesystem(&ext2_fs_type); 3981 + if (err) 3982 + printk(KERN_WARNING 3983 + "EXT4-fs: Unable to register as ext2 (%d)\n", err); 3984 + } 3985 + 3986 + static inline void unregister_as_ext2(void) 3987 + { 3988 + unregister_filesystem(&ext2_fs_type); 3989 + } 3990 + #else 3991 + static inline void register_as_ext2(void) { } 3992 + static inline void unregister_as_ext2(void) { } 3993 + #endif 3994 + 3995 + #if !defined(CONTIG_EXT3_FS) && 
!defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 3996 + static struct file_system_type ext3_fs_type = { 3997 + .owner = THIS_MODULE, 3998 + .name = "ext3", 3999 + .get_sb = ext4_get_sb, 4000 + .kill_sb = kill_block_super, 4001 + .fs_flags = FS_REQUIRES_DEV, 4002 + }; 4003 + 4004 + static inline void register_as_ext3(void) 4005 + { 4006 + int err = register_filesystem(&ext3_fs_type); 4007 + if (err) 4008 + printk(KERN_WARNING 4009 + "EXT4-fs: Unable to register as ext3 (%d)\n", err); 4010 + } 4011 + 4012 + static inline void unregister_as_ext3(void) 4013 + { 4014 + unregister_filesystem(&ext3_fs_type); 4015 + } 4016 + #else 4017 + static inline void register_as_ext3(void) { } 4018 + static inline void unregister_as_ext3(void) { } 4019 + #endif 4020 + 3967 4021 static struct file_system_type ext4_fs_type = { 3968 4022 .owner = THIS_MODULE, 3969 4023 .name = "ext4", ··· 4045 3995 err = init_inodecache(); 4046 3996 if (err) 4047 3997 goto out1; 3998 + register_as_ext2(); 3999 + register_as_ext3(); 4048 4000 err = register_filesystem(&ext4_fs_type); 4049 4001 if (err) 4050 4002 goto out; 4051 4003 return 0; 4052 4004 out: 4005 + unregister_as_ext2(); 4006 + unregister_as_ext3(); 4053 4007 destroy_inodecache(); 4054 4008 out1: 4055 4009 exit_ext4_xattr(); ··· 4069 4015 4070 4016 static void __exit exit_ext4_fs(void) 4071 4017 { 4018 + unregister_as_ext2(); 4019 + unregister_as_ext3(); 4072 4020 unregister_filesystem(&ext4_fs_type); 4073 4021 destroy_inodecache(); 4074 4022 exit_ext4_xattr();
+9 -6
fs/ext4/xattr.c
··· 482 482 ea_bdebug(bh, "refcount now=0; freeing"); 483 483 if (ce) 484 484 mb_cache_entry_free(ce); 485 - ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1); 486 485 get_bh(bh); 487 - ext4_forget(handle, 1, inode, bh, bh->b_blocknr); 486 + ext4_free_blocks(handle, inode, bh, 0, 1, 487 + EXT4_FREE_BLOCKS_METADATA | 488 + EXT4_FREE_BLOCKS_FORGET); 488 489 } else { 489 490 le32_add_cpu(&BHDR(bh)->h_refcount, -1); 490 491 error = ext4_handle_dirty_metadata(handle, inode, bh); ··· 833 832 new_bh = sb_getblk(sb, block); 834 833 if (!new_bh) { 835 834 getblk_failed: 836 - ext4_free_blocks(handle, inode, block, 1, 1); 835 + ext4_free_blocks(handle, inode, 0, block, 1, 836 + EXT4_FREE_BLOCKS_METADATA); 837 837 error = -EIO; 838 838 goto cleanup; 839 839 } ··· 990 988 if (error) 991 989 goto cleanup; 992 990 991 + error = ext4_journal_get_write_access(handle, is.iloc.bh); 992 + if (error) 993 + goto cleanup; 994 + 993 995 if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) { 994 996 struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); 995 997 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); ··· 1019 1013 if (flags & XATTR_CREATE) 1020 1014 goto cleanup; 1021 1015 } 1022 - error = ext4_journal_get_write_access(handle, is.iloc.bh); 1023 - if (error) 1024 - goto cleanup; 1025 1016 if (!value) { 1026 1017 if (!is.s.not_found) 1027 1018 error = ext4_xattr_ibody_set(handle, inode, &i, &is);
+4
fs/jbd2/commit.c
··· 636 636 JBUFFER_TRACE(jh, "ph3: write metadata"); 637 637 flags = jbd2_journal_write_metadata_buffer(commit_transaction, 638 638 jh, &new_jh, blocknr); 639 + if (flags < 0) { 640 + jbd2_journal_abort(journal, flags); 641 + continue; 642 + } 639 643 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); 640 644 wbuf[bufs++] = jh2bh(new_jh); 641 645
+12
fs/jbd2/journal.c
··· 78 78 EXPORT_SYMBOL(jbd2_journal_ack_err); 79 79 EXPORT_SYMBOL(jbd2_journal_clear_err); 80 80 EXPORT_SYMBOL(jbd2_log_wait_commit); 81 + EXPORT_SYMBOL(jbd2_log_start_commit); 81 82 EXPORT_SYMBOL(jbd2_journal_start_commit); 82 83 EXPORT_SYMBOL(jbd2_journal_force_commit_nested); 83 84 EXPORT_SYMBOL(jbd2_journal_wipe); ··· 359 358 360 359 jbd_unlock_bh_state(bh_in); 361 360 tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); 361 + if (!tmp) { 362 + jbd2_journal_put_journal_head(new_jh); 363 + return -ENOMEM; 364 + } 362 365 jbd_lock_bh_state(bh_in); 363 366 if (jh_in->b_frozen_data) { 364 367 jbd2_free(tmp, bh_in->b_size); ··· 1252 1247 * data from the journal. */ 1253 1248 if (jbd2_journal_recover(journal)) 1254 1249 goto recovery_error; 1250 + 1251 + if (journal->j_failed_commit) { 1252 + printk(KERN_ERR "JBD2: journal transaction %u on %s " 1253 + "is corrupt.\n", journal->j_failed_commit, 1254 + journal->j_devname); 1255 + return -EIO; 1256 + } 1255 1257 1256 1258 /* OK, we've finished with the dynamic journal bits: 1257 1259 * reinitialise the dynamic contents of the superblock in memory
+40 -14
include/trace/events/ext4.h
··· 38 38 __entry->blocks = inode->i_blocks; 39 39 ), 40 40 41 - TP_printk("dev %s ino %lu mode %d uid %u gid %u blocks %llu", 41 + TP_printk("dev %s ino %lu mode 0%o uid %u gid %u blocks %llu", 42 42 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, 43 43 __entry->mode, __entry->uid, __entry->gid, 44 44 (unsigned long long) __entry->blocks) ··· 61 61 __entry->mode = mode; 62 62 ), 63 63 64 - TP_printk("dev %s dir %lu mode %d", 64 + TP_printk("dev %s dir %lu mode 0%o", 65 65 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->dir, 66 66 __entry->mode) 67 67 ); ··· 85 85 __entry->mode = mode; 86 86 ), 87 87 88 - TP_printk("dev %s ino %lu dir %lu mode %d", 88 + TP_printk("dev %s ino %lu dir %lu mode 0%o", 89 89 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, 90 90 (unsigned long) __entry->dir, __entry->mode) 91 91 ); ··· 305 305 __field( int, ret ) 306 306 __field( int, pages_written ) 307 307 __field( long, pages_skipped ) 308 - __field( char, encountered_congestion ) 309 308 __field( char, more_io ) 310 309 __field( char, no_nrwrite_index_update ) 311 310 __field( pgoff_t, writeback_index ) ··· 316 317 __entry->ret = ret; 317 318 __entry->pages_written = pages_written; 318 319 __entry->pages_skipped = wbc->pages_skipped; 319 - __entry->encountered_congestion = wbc->encountered_congestion; 320 320 __entry->more_io = wbc->more_io; 321 321 __entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update; 322 322 __entry->writeback_index = inode->i_mapping->writeback_index; 323 323 ), 324 324 325 - TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d writeback_index %lu", 325 + TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld more_io %d no_nrwrite_index_update %d writeback_index %lu", 326 326 jbd2_dev_to_name(__entry->dev), 327 327 (unsigned long) __entry->ino, __entry->ret, 328 328 __entry->pages_written, __entry->pages_skipped, 329 - 
__entry->encountered_congestion, __entry->more_io, 329 + __entry->more_io, 330 330 __entry->no_nrwrite_index_update, 331 331 (unsigned long) __entry->writeback_index) 332 332 ); ··· 589 591 590 592 TRACE_EVENT(ext4_free_blocks, 591 593 TP_PROTO(struct inode *inode, __u64 block, unsigned long count, 592 - int metadata), 594 + int flags), 593 595 594 - TP_ARGS(inode, block, count, metadata), 596 + TP_ARGS(inode, block, count, flags), 595 597 596 598 TP_STRUCT__entry( 597 599 __field( dev_t, dev ) 598 600 __field( ino_t, ino ) 601 + __field( umode_t, mode ) 599 602 __field( __u64, block ) 600 603 __field( unsigned long, count ) 601 - __field( int, metadata ) 602 - 604 + __field( int, flags ) 603 605 ), 604 606 605 607 TP_fast_assign( 606 608 __entry->dev = inode->i_sb->s_dev; 607 609 __entry->ino = inode->i_ino; 610 + __entry->mode = inode->i_mode; 608 611 __entry->block = block; 609 612 __entry->count = count; 610 - __entry->metadata = metadata; 613 + __entry->flags = flags; 611 614 ), 612 615 613 - TP_printk("dev %s ino %lu block %llu count %lu metadata %d", 616 + TP_printk("dev %s ino %lu mode 0%o block %llu count %lu flags %d", 614 617 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, 615 - __entry->block, __entry->count, __entry->metadata) 618 + __entry->mode, __entry->block, __entry->count, 619 + __entry->flags) 616 620 ); 617 621 618 622 TRACE_EVENT(ext4_sync_file, ··· 846 846 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, 847 847 __entry->result_group, __entry->result_start, 848 848 __entry->result_len, __entry->result_logical) 849 + ); 850 + 851 + TRACE_EVENT(ext4_forget, 852 + TP_PROTO(struct inode *inode, int is_metadata, __u64 block), 853 + 854 + TP_ARGS(inode, is_metadata, block), 855 + 856 + TP_STRUCT__entry( 857 + __field( dev_t, dev ) 858 + __field( ino_t, ino ) 859 + __field( umode_t, mode ) 860 + __field( int, is_metadata ) 861 + __field( __u64, block ) 862 + ), 863 + 864 + TP_fast_assign( 865 + __entry->dev = 
inode->i_sb->s_dev; 866 + __entry->ino = inode->i_ino; 867 + __entry->mode = inode->i_mode; 868 + __entry->is_metadata = is_metadata; 869 + __entry->block = block; 870 + ), 871 + 872 + TP_printk("dev %s ino %lu mode 0%o is_metadata %d block %llu", 873 + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, 874 + __entry->mode, __entry->is_metadata, __entry->block) 849 875 ); 850 876 851 877 #endif /* _TRACE_EXT4_H */