Merge tag 'ext4_for_linus_stable' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 fixes from Ted Ts'o:
"Miscellaneous ext4 bug fixes for v5.12"

* tag 'ext4_for_linus_stable' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
ext4: initialize ret to suppress smatch warning
ext4: stop inode update before return
ext4: fix rename whiteout with fast commit
ext4: fix timer use-after-free on failed mount
ext4: fix potential error in ext4_do_update_inode
ext4: do not try to set xattr into ea_inode if value is empty
ext4: do not iput inode under running transaction in ext4_rename()
ext4: find old entry again if failed to rename whiteout
ext4: fix error handling in ext4_end_enable_verity()
ext4: fix bh ref count on error paths
fs/ext4: fix integer overflow in s_log_groups_per_flex
ext4: add reclaim checks to xattr code
ext4: shrink race window in ext4_should_retry_alloc()

+26 -12
fs/ext4/balloc.c
··· 626 626 627 627 /** 628 628 * ext4_should_retry_alloc() - check if a block allocation should be retried 629 - * @sb: super block 630 - * @retries: number of attemps has been made 629 + * @sb: superblock 630 + * @retries: number of retry attempts made so far 631 631 * 632 - * ext4_should_retry_alloc() is called when ENOSPC is returned, and if 633 - * it is profitable to retry the operation, this function will wait 634 - * for the current or committing transaction to complete, and then 635 - * return TRUE. We will only retry once. 632 + * ext4_should_retry_alloc() is called when ENOSPC is returned while 633 + * attempting to allocate blocks. If there's an indication that a pending 634 + * journal transaction might free some space and allow another attempt to 635 + * succeed, this function will wait for the current or committing transaction 636 + * to complete and then return TRUE. 636 637 */ 637 638 int ext4_should_retry_alloc(struct super_block *sb, int *retries) 638 639 { 639 - if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) || 640 - (*retries)++ > 1 || 641 - !EXT4_SB(sb)->s_journal) 640 + struct ext4_sb_info *sbi = EXT4_SB(sb); 641 + 642 + if (!sbi->s_journal) 642 643 return 0; 643 644 645 + if (++(*retries) > 3) { 646 + percpu_counter_inc(&sbi->s_sra_exceeded_retry_limit); 647 + return 0; 648 + } 649 + 650 + /* 651 + * if there's no indication that blocks are about to be freed it's 652 + * possible we just missed a transaction commit that did so 653 + */ 644 654 smp_mb(); 645 - if (EXT4_SB(sb)->s_mb_free_pending == 0) 646 - return 0; 655 + if (sbi->s_mb_free_pending == 0) 656 + return ext4_has_free_clusters(sbi, 1, 0); 647 657 658 + /* 659 + * it's possible we've just missed a transaction commit here, 660 + * so ignore the returned status 661 + */ 648 662 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); 649 - jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); 663 + (void) jbd2_journal_force_commit_nested(sbi->s_journal); 650 664 return 1; 651 665 } 652 666
+3
fs/ext4/ext4.h
··· 1484 1484 struct percpu_counter s_freeinodes_counter; 1485 1485 struct percpu_counter s_dirs_counter; 1486 1486 struct percpu_counter s_dirtyclusters_counter; 1487 + struct percpu_counter s_sra_exceeded_retry_limit; 1487 1488 struct blockgroup_lock *s_blockgroup_lock; 1488 1489 struct proc_dir_entry *s_proc; 1489 1490 struct kobject s_kobj; ··· 2794 2793 struct dentry *dentry); 2795 2794 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry); 2796 2795 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry); 2796 + void __ext4_fc_track_create(handle_t *handle, struct inode *inode, 2797 + struct dentry *dentry); 2797 2798 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); 2798 2799 void ext4_fc_track_inode(handle_t *handle, struct inode *inode); 2799 2800 void ext4_fc_mark_ineligible(struct super_block *sb, int reason);
+1 -1
fs/ext4/extents.c
··· 4382 4382 { 4383 4383 struct inode *inode = file_inode(file); 4384 4384 handle_t *handle; 4385 - int ret, ret2 = 0, ret3 = 0; 4385 + int ret = 0, ret2 = 0, ret3 = 0; 4386 4386 int retries = 0; 4387 4387 int depth = 0; 4388 4388 struct ext4_map_blocks map;
+7 -2
fs/ext4/fast_commit.c
··· 513 513 __ext4_fc_track_link(handle, d_inode(dentry), dentry); 514 514 } 515 515 516 - void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) 516 + void __ext4_fc_track_create(handle_t *handle, struct inode *inode, 517 + struct dentry *dentry) 517 518 { 518 519 struct __track_dentry_update_args args; 519 - struct inode *inode = d_inode(dentry); 520 520 int ret; 521 521 522 522 args.dentry = dentry; ··· 525 525 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 526 526 (void *)&args, 0); 527 527 trace_ext4_fc_track_create(inode, dentry, ret); 528 + } 529 + 530 + void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) 531 + { 532 + __ext4_fc_track_create(handle, d_inode(dentry), dentry); 528 533 } 529 534 530 535 /* __track_fn for inode tracking */
+10 -8
fs/ext4/inode.c
··· 1938 1938 if (!ret) 1939 1939 ret = err; 1940 1940 1941 - if (!ext4_has_inline_data(inode)) 1942 - ext4_walk_page_buffers(NULL, page_bufs, 0, len, 1943 - NULL, bput_one); 1944 1941 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 1945 1942 out: 1946 1943 unlock_page(page); 1947 1944 out_no_pagelock: 1945 + if (!inline_data && page_bufs) 1946 + ext4_walk_page_buffers(NULL, page_bufs, 0, len, 1947 + NULL, bput_one); 1948 1948 brelse(inode_bh); 1949 1949 return ret; 1950 1950 } ··· 5026 5026 struct ext4_inode_info *ei = EXT4_I(inode); 5027 5027 struct buffer_head *bh = iloc->bh; 5028 5028 struct super_block *sb = inode->i_sb; 5029 - int err = 0, rc, block; 5029 + int err = 0, block; 5030 5030 int need_datasync = 0, set_large_file = 0; 5031 5031 uid_t i_uid; 5032 5032 gid_t i_gid; ··· 5138 5138 bh->b_data); 5139 5139 5140 5140 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 5141 - rc = ext4_handle_dirty_metadata(handle, NULL, bh); 5142 - if (!err) 5143 - err = rc; 5141 + err = ext4_handle_dirty_metadata(handle, NULL, bh); 5142 + if (err) 5143 + goto out_brelse; 5144 5144 ext4_clear_inode_state(inode, EXT4_STATE_NEW); 5145 5145 if (set_large_file) { 5146 5146 BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); ··· 5387 5387 inode->i_gid = attr->ia_gid; 5388 5388 error = ext4_mark_inode_dirty(handle, inode); 5389 5389 ext4_journal_stop(handle); 5390 - if (unlikely(error)) 5390 + if (unlikely(error)) { 5391 + ext4_fc_stop_update(inode); 5391 5392 return error; 5393 + } 5392 5394 } 5393 5395 5394 5396 if (attr->ia_valid & ATTR_SIZE) {
+9 -2
fs/ext4/mballoc.c
··· 2709 2709 } 2710 2710 2711 2711 if (ext4_has_feature_flex_bg(sb)) { 2712 - /* a single flex group is supposed to be read by a single IO */ 2713 - sbi->s_mb_prefetch = min(1 << sbi->s_es->s_log_groups_per_flex, 2712 + /* a single flex group is supposed to be read by a single IO. 2713 + * 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is 2714 + * unsigned integer, so the maximum shift is 32. 2715 + */ 2716 + if (sbi->s_es->s_log_groups_per_flex >= 32) { 2717 + ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group"); 2718 + goto err_freesgi; 2719 + } 2720 + sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex, 2714 2721 BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9)); 2715 2722 sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */ 2716 2723 } else {
+39 -11
fs/ext4/namei.c
··· 3613 3613 return retval; 3614 3614 } 3615 3615 3616 + static void ext4_resetent(handle_t *handle, struct ext4_renament *ent, 3617 + unsigned ino, unsigned file_type) 3618 + { 3619 + struct ext4_renament old = *ent; 3620 + int retval = 0; 3621 + 3622 + /* 3623 + * old->de could have moved from under us during make indexed dir, 3624 + * so the old->de may no longer valid and need to find it again 3625 + * before reset old inode info. 3626 + */ 3627 + old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL); 3628 + if (IS_ERR(old.bh)) 3629 + retval = PTR_ERR(old.bh); 3630 + if (!old.bh) 3631 + retval = -ENOENT; 3632 + if (retval) { 3633 + ext4_std_error(old.dir->i_sb, retval); 3634 + return; 3635 + } 3636 + 3637 + ext4_setent(handle, &old, ino, file_type); 3638 + brelse(old.bh); 3639 + } 3640 + 3616 3641 static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, 3617 3642 const struct qstr *d_name) 3618 3643 { ··· 3799 3774 */ 3800 3775 retval = -ENOENT; 3801 3776 if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) 3802 - goto end_rename; 3777 + goto release_bh; 3803 3778 3804 3779 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, 3805 3780 &new.de, &new.inlined); 3806 3781 if (IS_ERR(new.bh)) { 3807 3782 retval = PTR_ERR(new.bh); 3808 3783 new.bh = NULL; 3809 - goto end_rename; 3784 + goto release_bh; 3810 3785 } 3811 3786 if (new.bh) { 3812 3787 if (!new.inode) { ··· 3823 3798 handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits); 3824 3799 if (IS_ERR(handle)) { 3825 3800 retval = PTR_ERR(handle); 3826 - handle = NULL; 3827 - goto end_rename; 3801 + goto release_bh; 3828 3802 } 3829 3803 } else { 3830 3804 whiteout = ext4_whiteout_for_rename(mnt_userns, &old, credits, &handle); 3831 3805 if (IS_ERR(whiteout)) { 3832 3806 retval = PTR_ERR(whiteout); 3833 - whiteout = NULL; 3834 - goto end_rename; 3807 + goto release_bh; 3835 3808 } 3836 3809 } 3837 3810 ··· 3873 3850 retval = ext4_mark_inode_dirty(handle, whiteout); 3874 3851 if (unlikely(retval)) 3875 3852 goto end_rename; 3853 + 3876 3854 } 3877 3855 if (!new.bh) { 3878 3856 retval = ext4_add_entry(handle, new.dentry, old.inode); ··· 3947 3923 ext4_fc_track_unlink(handle, new.dentry); 3948 3924 __ext4_fc_track_link(handle, old.inode, new.dentry); 3949 3925 __ext4_fc_track_unlink(handle, old.inode, old.dentry); 3926 + if (whiteout) 3927 + __ext4_fc_track_create(handle, whiteout, old.dentry); 3950 3928 } 3951 3929 3952 3930 if (new.inode) { ··· 3963 3937 end_rename: 3964 3938 if (whiteout) { 3965 3939 if (retval) { 3966 - ext4_setent(handle, &old, 3967 - old.inode->i_ino, old_file_type); 3940 + ext4_resetent(handle, &old, 3941 + old.inode->i_ino, old_file_type); 3968 3942 drop_nlink(whiteout); 3943 + ext4_orphan_add(handle, whiteout); 3969 3944 } 3970 3945 unlock_new_inode(whiteout); 3946 + ext4_journal_stop(handle); 3971 3947 iput(whiteout); 3972 - 3948 + } else { 3949 + ext4_journal_stop(handle); 3973 3950 } 3951 + release_bh: 3974 3952 brelse(old.dir_bh); 3975 3953 brelse(old.bh); 3976 3954 brelse(new.bh); 3977 - if (handle) 3978 - ext4_journal_stop(handle); 3979 3955 return retval; 3980 3956 } 3981 3957
+6 -1
fs/ext4/super.c
··· 1210 1210 percpu_counter_destroy(&sbi->s_freeinodes_counter); 1211 1211 percpu_counter_destroy(&sbi->s_dirs_counter); 1212 1212 percpu_counter_destroy(&sbi->s_dirtyclusters_counter); 1213 + percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); 1213 1214 percpu_free_rwsem(&sbi->s_writepages_rwsem); 1214 1215 #ifdef CONFIG_QUOTA 1215 1216 for (i = 0; i < EXT4_MAXQUOTAS; i++) ··· 5013 5012 err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, 5014 5013 GFP_KERNEL); 5015 5014 if (!err) 5015 + err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0, 5016 + GFP_KERNEL); 5017 + if (!err) 5016 5018 err = percpu_init_rwsem(&sbi->s_writepages_rwsem); 5017 5019 5018 5020 if (err) { ··· 5128 5124 percpu_counter_destroy(&sbi->s_freeinodes_counter); 5129 5125 percpu_counter_destroy(&sbi->s_dirs_counter); 5130 5126 percpu_counter_destroy(&sbi->s_dirtyclusters_counter); 5127 + percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); 5131 5128 percpu_free_rwsem(&sbi->s_writepages_rwsem); 5132 5129 failed_mount5: 5133 5130 ext4_ext_release(sb); ··· 5154 5149 failed_mount3a: 5155 5150 ext4_es_unregister_shrinker(sbi); 5156 5151 failed_mount3: 5157 - del_timer_sync(&sbi->s_err_report); 5158 5152 flush_work(&sbi->s_error_work); 5153 + del_timer_sync(&sbi->s_err_report); 5159 5154 if (sbi->s_mmp_tsk) 5160 5155 kthread_stop(sbi->s_mmp_tsk); 5161 5156 failed_mount2:
+7
fs/ext4/sysfs.c
··· 24 24 attr_session_write_kbytes, 25 25 attr_lifetime_write_kbytes, 26 26 attr_reserved_clusters, 27 + attr_sra_exceeded_retry_limit, 27 28 attr_inode_readahead, 28 29 attr_trigger_test_error, 29 30 attr_first_error_time, ··· 203 202 EXT4_ATTR_FUNC(session_write_kbytes, 0444); 204 203 EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444); 205 204 EXT4_ATTR_FUNC(reserved_clusters, 0644); 205 + EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444); 206 206 207 207 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead, 208 208 ext4_sb_info, s_inode_readahead_blks); ··· 253 251 ATTR_LIST(session_write_kbytes), 254 252 ATTR_LIST(lifetime_write_kbytes), 255 253 ATTR_LIST(reserved_clusters), 254 + ATTR_LIST(sra_exceeded_retry_limit), 256 255 ATTR_LIST(inode_readahead_blks), 257 256 ATTR_LIST(inode_goal), 258 257 ATTR_LIST(mb_stats), ··· 377 374 return snprintf(buf, PAGE_SIZE, "%llu\n", 378 375 (unsigned long long) 379 376 atomic64_read(&sbi->s_resv_clusters)); 377 + case attr_sra_exceeded_retry_limit: 378 + return snprintf(buf, PAGE_SIZE, "%llu\n", 379 + (unsigned long long) 380 + percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit)); 380 381 case attr_inode_readahead: 381 382 case attr_pointer_ui: 382 383 if (!ptr)
+57 -36
fs/ext4/verity.c
··· 201 201 struct inode *inode = file_inode(filp); 202 202 const int credits = 2; /* superblock and inode for ext4_orphan_del() */ 203 203 handle_t *handle; 204 + struct ext4_iloc iloc; 204 205 int err = 0; 205 - int err2; 206 - 207 - if (desc != NULL) { 208 - /* Succeeded; write the verity descriptor. */ 209 - err = ext4_write_verity_descriptor(inode, desc, desc_size, 210 - merkle_tree_size); 211 - 212 - /* Write all pages before clearing VERITY_IN_PROGRESS. */ 213 - if (!err) 214 - err = filemap_write_and_wait(inode->i_mapping); 215 - } 216 - 217 - /* If we failed, truncate anything we wrote past i_size. */ 218 - if (desc == NULL || err) 219 - ext4_truncate(inode); 220 206 221 207 /* 222 - * We must always clean up by clearing EXT4_STATE_VERITY_IN_PROGRESS and 223 - * deleting the inode from the orphan list, even if something failed. 224 - * If everything succeeded, we'll also set the verity bit in the same 225 - * transaction. 208 + * If an error already occurred (which fs/verity/ signals by passing 209 + * desc == NULL), then only clean-up is needed. 226 210 */ 211 + if (desc == NULL) 212 + goto cleanup; 227 213 228 - ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); 214 + /* Append the verity descriptor. */ 215 + err = ext4_write_verity_descriptor(inode, desc, desc_size, 216 + merkle_tree_size); 217 + if (err) 218 + goto cleanup; 219 + 220 + /* 221 + * Write all pages (both data and verity metadata). Note that this must 222 + * happen before clearing EXT4_STATE_VERITY_IN_PROGRESS; otherwise pages 223 + * beyond i_size won't be written properly. For crash consistency, this 224 + * also must happen before the verity inode flag gets persisted. 225 + */ 226 + err = filemap_write_and_wait(inode->i_mapping); 227 + if (err) 228 + goto cleanup; 229 + 230 + /* 231 + * Finally, set the verity inode flag and remove the inode from the 232 + * orphan list (in a single transaction). 233 + */ 229 234 230 235 handle = ext4_journal_start(inode, EXT4_HT_INODE, credits); 231 236 if (IS_ERR(handle)) { 232 - ext4_orphan_del(NULL, inode); 233 - return PTR_ERR(handle); 237 + err = PTR_ERR(handle); 238 + goto cleanup; 234 239 } 235 240 236 - err2 = ext4_orphan_del(handle, inode); 237 - if (err2) 238 - goto out_stop; 241 + err = ext4_orphan_del(handle, inode); 242 + if (err) 243 + goto stop_and_cleanup; 239 244 240 - if (desc != NULL && !err) { 241 - struct ext4_iloc iloc; 245 + err = ext4_reserve_inode_write(handle, inode, &iloc); 246 + if (err) 247 + goto stop_and_cleanup; 242 248 243 - err = ext4_reserve_inode_write(handle, inode, &iloc); 244 - if (err) 245 - goto out_stop; 246 - ext4_set_inode_flag(inode, EXT4_INODE_VERITY); 247 - ext4_set_inode_flags(inode, false); 248 - err = ext4_mark_iloc_dirty(handle, inode, &iloc); 249 - } 250 - out_stop: 249 + ext4_set_inode_flag(inode, EXT4_INODE_VERITY); 250 + ext4_set_inode_flags(inode, false); 251 + err = ext4_mark_iloc_dirty(handle, inode, &iloc); 252 + if (err) 253 + goto stop_and_cleanup; 254 + 251 255 ext4_journal_stop(handle); 252 - return err ?: err2; 256 + 257 + ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); 258 + return 0; 259 + 260 + stop_and_cleanup: 261 + ext4_journal_stop(handle); 262 + cleanup: 263 + /* 264 + * Verity failed to be enabled, so clean up by truncating any verity 265 + * metadata that was written beyond i_size (both from cache and from 266 + * disk), removing the inode from the orphan list (if it wasn't done 267 + * already), and clearing EXT4_STATE_VERITY_IN_PROGRESS. 268 + */ 269 + truncate_inode_pages(inode->i_mapping, inode->i_size); 270 + ext4_truncate(inode); 271 + ext4_orphan_del(NULL, inode); 272 + ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); 273 + return err; 253 274 } 254 275 255 276 static int ext4_get_verity_descriptor_location(struct inode *inode,
+5 -1
fs/ext4/xattr.c
··· 1462 1462 if (!ce) 1463 1463 return NULL; 1464 1464 1465 + WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()) && 1466 + !(current->flags & PF_MEMALLOC_NOFS)); 1467 + 1465 1468 ea_data = kvmalloc(value_len, GFP_KERNEL); 1466 1469 if (!ea_data) { 1467 1470 mb_cache_entry_put(ea_inode_cache, ce); ··· 2330 2327 error = -ENOSPC; 2331 2328 goto cleanup; 2332 2329 } 2330 + WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS)); 2333 2331 } 2334 2332 2335 2333 error = ext4_reserve_inode_write(handle, inode, &is.iloc); ··· 2404 2400 * external inode if possible. 2405 2401 */ 2406 2402 if (ext4_has_feature_ea_inode(inode->i_sb) && 2407 - !i.in_inode) { 2403 + i.value_len && !i.in_inode) { 2408 2404 i.in_inode = 1; 2409 2405 goto retry_inode; 2410 2406 }