commit cc883236b79297f6266ca6f4e7f24f3fd3c736c1 · tjh.dev/kernel

tjh.dev / kernel

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

ext4: drop unnecessary journal handle in delalloc write

After we factor out the inline data write procedure from
ext4_da_write_end(), we don't need to start a journal handle for either
the buffer overwrite or the append-write case. If we need to update
i_disksize, mark_inode_dirty() starts a handle and updates the inode
buffer itself. So we can just remove all the journal handle code in the
delalloc write procedure.

After this patch, we get a significant performance improvement. Below
is the Unixbench comparison data from a test run on my machine with an
'Intel Xeon Gold 5120' CPU and an NVMe SSD backend.

Test cmd:

./Run -c 56 -i 3 fstime fsbuffer fsdisk

Before this patch:

System Benchmarks Partial Index BASELINE RESULT INDEX
File Copy 1024 bufsize 2000 maxblocks 3960.0 422965.0 1068.1
File Copy 256 bufsize 500 maxblocks 1655.0 105077.0 634.9
File Copy 4096 bufsize 8000 maxblocks 5800.0 1429092.0 2464.0
======
System Benchmarks Index Score (Partial Only) 1186.6

After this patch:

System Benchmarks Partial Index BASELINE RESULT INDEX
File Copy 1024 bufsize 2000 maxblocks 3960.0 732716.0 1850.3
File Copy 256 bufsize 500 maxblocks 1655.0 184940.0 1117.5
File Copy 4096 bufsize 8000 maxblocks 5800.0 2427152.0 4184.7
======
System Benchmarks Index Score (Partial Only) 2053.0

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Link: https://lore.kernel.org/r/20210716122024.1105856-5-yi.zhang@huawei.com

authored by Zhang Yi and committed by Theodore Ts'o 4 years ago cc883236 6984aef5

+5 -55

1 changed file

expand all

unified split

ext4

inode.c

+5 -55

fs/ext4/inode.c

··· 2910 2910 return 0; 2911 2911 } 2912 2912 2913 - /* We always reserve for an inode update; the superblock could be there too */ 2914 - static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len) 2915 - { 2916 - if (likely(ext4_has_feature_large_file(inode->i_sb))) 2917 - return 1; 2918 - 2919 - if (pos + len <= 0x7fffffffULL) 2920 - return 1; 2921 - 2922 - /* We might need to update the superblock to set LARGE_FILE */ 2923 - return 2; 2924 - } 2925 - 2926 2913 static int ext4_da_write_begin(struct file *file, struct address_space *mapping, 2927 2914 loff_t pos, unsigned len, unsigned flags, 2928 2915 struct page **pagep, void **fsdata) ··· 2918 2931 struct page *page; 2919 2932 pgoff_t index; 2920 2933 struct inode *inode = mapping->host; 2921 - handle_t *handle; 2922 2934 2923 2935 if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) 2924 2936 return -EIO; ··· 2943 2957 return 0; 2944 2958 } 2945 2959 2946 - /* 2947 - * grab_cache_page_write_begin() can take a long time if the 2948 - * system is thrashing due to memory pressure, or if the page 2949 - * is being written back. So grab it first before we start 2950 - * the transaction handle. This also allows us to allocate 2951 - * the page (if needed) without using GFP_NOFS. 2952 - */ 2953 - retry_grab: 2960 + retry: 2954 2961 page = grab_cache_page_write_begin(mapping, index, flags); 2955 2962 if (!page) 2956 2963 return -ENOMEM; 2957 - unlock_page(page); 2958 2964 2959 - /* 2960 - * With delayed allocation, we don't log the i_disksize update 2961 - * if there is delayed block allocation. But we still need 2962 - * to journalling the i_disksize update if writes to the end 2963 - * of file which has an already mapped buffer. 
2964 - */ 2965 - retry_journal: 2966 - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 2967 - ext4_da_write_credits(inode, pos, len)); 2968 - if (IS_ERR(handle)) { 2969 - put_page(page); 2970 - return PTR_ERR(handle); 2971 - } 2972 - 2973 - lock_page(page); 2974 - if (page->mapping != mapping) { 2975 - /* The page got truncated from under us */ 2976 - unlock_page(page); 2977 - put_page(page); 2978 - ext4_journal_stop(handle); 2979 - goto retry_grab; 2980 - } 2981 2965 /* In case writeback began while the page was unlocked */ 2982 2966 wait_for_stable_page(page); 2983 2967 ··· 2959 3003 #endif 2960 3004 if (ret < 0) { 2961 3005 unlock_page(page); 2962 - ext4_journal_stop(handle); 3006 + put_page(page); 2963 3007 /* 2964 3008 * block_write_begin may have instantiated a few blocks 2965 3009 * outside i_size. Trim these off again. Don't need 2966 - * i_size_read because we hold i_mutex. 3010 + * i_size_read because we hold inode lock. 2967 3011 */ 2968 3012 if (pos + len > inode->i_size) 2969 3013 ext4_truncate_failed_write(inode); 2970 3014 2971 3015 if (ret == -ENOSPC && 2972 3016 ext4_should_retry_alloc(inode->i_sb, &retries)) 2973 - goto retry_journal; 2974 - 2975 - put_page(page); 3017 + goto retry; 2976 3018 return ret; 2977 3019 } 2978 3020 ··· 3007 3053 struct page *page, void *fsdata) 3008 3054 { 3009 3055 struct inode *inode = mapping->host; 3010 - int ret; 3011 - handle_t *handle = ext4_journal_current_handle(); 3012 3056 loff_t new_i_size; 3013 3057 unsigned long start, end; 3014 3058 int write_mode = (int)(unsigned long)fsdata; ··· 3045 3093 ext4_da_should_update_i_disksize(page, end)) 3046 3094 ext4_update_i_disksize(inode, new_i_size); 3047 3095 3048 - copied = generic_write_end(file, mapping, pos, len, copied, page, fsdata); 3049 - ret = ext4_journal_stop(handle); 3050 - return ret ? ret : copied; 3096 + return generic_write_end(file, mapping, pos, len, copied, page, fsdata); 3051 3097 } 3052 3098 3053 3099 /*