Merge branch 'delalloc-buffer-write' into dev

Fix a bug in how we update i_disksize, and fix the error path in
ext4_write_inline_data_end(). Finally, drop an unnecessary creation
of a journal handle that was only needed for inline data; dropping
it can give us a large performance gain in delayed allocation writes.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>

+106 -180
-3
fs/ext4/ext4.h
··· 3603 unsigned flags, 3604 struct page **pagep, 3605 void **fsdata); 3606 - extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, 3607 - unsigned len, unsigned copied, 3608 - struct page *page); 3609 extern int ext4_try_add_inline_entry(handle_t *handle, 3610 struct ext4_filename *fname, 3611 struct inode *dir, struct inode *inode);
··· 3603 unsigned flags, 3604 struct page **pagep, 3605 void **fsdata); 3606 extern int ext4_try_add_inline_entry(handle_t *handle, 3607 struct ext4_filename *fname, 3608 struct inode *dir, struct inode *inode);
+66 -65
fs/ext4/inline.c
··· 733 int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, 734 unsigned copied, struct page *page) 735 { 736 - int ret, no_expand; 737 void *kaddr; 738 struct ext4_iloc iloc; 739 740 - if (unlikely(copied < len)) { 741 - if (!PageUptodate(page)) { 742 - copied = 0; 743 goto out; 744 } 745 - } 746 747 - ret = ext4_get_inode_loc(inode, &iloc); 748 - if (ret) { 749 - ext4_std_error(inode->i_sb, ret); 750 - copied = 0; 751 - goto out; 752 - } 753 754 - ext4_write_lock_xattr(inode, &no_expand); 755 - BUG_ON(!ext4_has_inline_data(inode)); 756 757 /* 758 - * ei->i_inline_off may have changed since ext4_write_begin() 759 - * called ext4_try_to_write_inline_data() 760 */ 761 - (void) ext4_find_inline_data_nolock(inode); 762 - 763 - kaddr = kmap_atomic(page); 764 - ext4_write_inline_data(inode, &iloc, kaddr, pos, len); 765 - kunmap_atomic(kaddr); 766 - SetPageUptodate(page); 767 - /* clear page dirty so that writepages wouldn't work for us. */ 768 - ClearPageDirty(page); 769 - 770 - ext4_write_unlock_xattr(inode, &no_expand); 771 - brelse(iloc.bh); 772 - mark_inode_dirty(inode); 773 out: 774 - return copied; 775 } 776 777 struct buffer_head * ··· 989 out: 990 brelse(iloc.bh); 991 return ret; 992 - } 993 - 994 - int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, 995 - unsigned len, unsigned copied, 996 - struct page *page) 997 - { 998 - int ret; 999 - 1000 - ret = ext4_write_inline_data_end(inode, pos, len, copied, page); 1001 - if (ret < 0) { 1002 - unlock_page(page); 1003 - put_page(page); 1004 - return ret; 1005 - } 1006 - copied = ret; 1007 - 1008 - /* 1009 - * No need to use i_size_read() here, the i_size 1010 - * cannot change under us because we hold i_mutex. 1011 - * 1012 - * But it's important to update i_size while still holding page lock: 1013 - * page writeout could otherwise come in and zero beyond i_size. 
1014 - */ 1015 - if (pos+copied > inode->i_size) 1016 - i_size_write(inode, pos+copied); 1017 - unlock_page(page); 1018 - put_page(page); 1019 - 1020 - /* 1021 - * Don't mark the inode dirty under page lock. First, it unnecessarily 1022 - * makes the holding time of page lock longer. Second, it forces lock 1023 - * ordering of page lock and transaction start for journaling 1024 - * filesystems. 1025 - */ 1026 - mark_inode_dirty(inode); 1027 - 1028 - return copied; 1029 } 1030 1031 #ifdef INLINE_DIR_DEBUG
··· 733 int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, 734 unsigned copied, struct page *page) 735 { 736 + handle_t *handle = ext4_journal_current_handle(); 737 + int no_expand; 738 void *kaddr; 739 struct ext4_iloc iloc; 740 + int ret = 0, ret2; 741 742 + if (unlikely(copied < len) && !PageUptodate(page)) 743 + copied = 0; 744 + 745 + if (likely(copied)) { 746 + ret = ext4_get_inode_loc(inode, &iloc); 747 + if (ret) { 748 + unlock_page(page); 749 + put_page(page); 750 + ext4_std_error(inode->i_sb, ret); 751 goto out; 752 } 753 + ext4_write_lock_xattr(inode, &no_expand); 754 + BUG_ON(!ext4_has_inline_data(inode)); 755 756 + /* 757 + * ei->i_inline_off may have changed since 758 + * ext4_write_begin() called 759 + * ext4_try_to_write_inline_data() 760 + */ 761 + (void) ext4_find_inline_data_nolock(inode); 762 763 + kaddr = kmap_atomic(page); 764 + ext4_write_inline_data(inode, &iloc, kaddr, pos, copied); 765 + kunmap_atomic(kaddr); 766 + SetPageUptodate(page); 767 + /* clear page dirty so that writepages wouldn't work for us. */ 768 + ClearPageDirty(page); 769 + 770 + ext4_write_unlock_xattr(inode, &no_expand); 771 + brelse(iloc.bh); 772 + 773 + /* 774 + * It's important to update i_size while still holding page 775 + * lock: page writeout could otherwise come in and zero 776 + * beyond i_size. 777 + */ 778 + ext4_update_inode_size(inode, pos + copied); 779 + } 780 + unlock_page(page); 781 + put_page(page); 782 783 /* 784 + * Don't mark the inode dirty under page lock. First, it unnecessarily 785 + * makes the holding time of page lock longer. Second, it forces lock 786 + * ordering of page lock and transaction start for journaling 787 + * filesystems. 788 */ 789 + if (likely(copied)) 790 + mark_inode_dirty(inode); 791 out: 792 + /* 793 + * If we didn't copy as much data as expected, we need to trim back 794 + * size of xattr containing inline data. 
795 + */ 796 + if (pos + len > inode->i_size && ext4_can_truncate(inode)) 797 + ext4_orphan_add(handle, inode); 798 + 799 + ret2 = ext4_journal_stop(handle); 800 + if (!ret) 801 + ret = ret2; 802 + if (pos + len > inode->i_size) { 803 + ext4_truncate_failed_write(inode); 804 + /* 805 + * If truncate failed early the inode might still be 806 + * on the orphan list; we need to make sure the inode 807 + * is removed from the orphan list in that case. 808 + */ 809 + if (inode->i_nlink) 810 + ext4_orphan_del(NULL, inode); 811 + } 812 + return ret ? ret : copied; 813 } 814 815 struct buffer_head * ··· 951 out: 952 brelse(iloc.bh); 953 return ret; 954 } 955 956 #ifdef INLINE_DIR_DEBUG
+40 -112
fs/ext4/inode.c
··· 1284 loff_t old_size = inode->i_size; 1285 int ret = 0, ret2; 1286 int i_size_changed = 0; 1287 - int inline_data = ext4_has_inline_data(inode); 1288 bool verity = ext4_verity_in_progress(inode); 1289 1290 trace_ext4_write_end(inode, pos, len, copied); 1291 - if (inline_data) { 1292 - ret = ext4_write_inline_data_end(inode, pos, len, 1293 - copied, page); 1294 - if (ret < 0) { 1295 - unlock_page(page); 1296 - put_page(page); 1297 - goto errout; 1298 - } 1299 - copied = ret; 1300 - } else 1301 - copied = block_write_end(file, mapping, pos, 1302 - len, copied, page, fsdata); 1303 /* 1304 * it's important to update i_size while still holding page lock: 1305 * page writeout could otherwise come in and zero beyond i_size. ··· 1312 * ordering of page lock and transaction start for journaling 1313 * filesystems. 1314 */ 1315 - if (i_size_changed || inline_data) 1316 ret = ext4_mark_inode_dirty(handle, inode); 1317 1318 if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) ··· 1321 * inode->i_size. 
So truncate them 1322 */ 1323 ext4_orphan_add(handle, inode); 1324 - errout: 1325 ret2 = ext4_journal_stop(handle); 1326 if (!ret) 1327 ret = ret2; ··· 1387 int partial = 0; 1388 unsigned from, to; 1389 int size_changed = 0; 1390 - int inline_data = ext4_has_inline_data(inode); 1391 bool verity = ext4_verity_in_progress(inode); 1392 1393 trace_ext4_journalled_write_end(inode, pos, len, copied); ··· 1395 1396 BUG_ON(!ext4_handle_valid(handle)); 1397 1398 - if (inline_data) { 1399 - ret = ext4_write_inline_data_end(inode, pos, len, 1400 - copied, page); 1401 - if (ret < 0) { 1402 - unlock_page(page); 1403 - put_page(page); 1404 - goto errout; 1405 - } 1406 - copied = ret; 1407 - } else if (unlikely(copied < len) && !PageUptodate(page)) { 1408 copied = 0; 1409 ext4_journalled_zero_new_buffers(handle, inode, page, from, to); 1410 } else { ··· 1421 if (old_size < pos && !verity) 1422 pagecache_isize_extended(inode, old_size, pos); 1423 1424 - if (size_changed || inline_data) { 1425 ret2 = ext4_mark_inode_dirty(handle, inode); 1426 if (!ret) 1427 ret = ret2; ··· 1434 */ 1435 ext4_orphan_add(handle, inode); 1436 1437 - errout: 1438 ret2 = ext4_journal_stop(handle); 1439 if (!ret) 1440 ret = ret2; ··· 2916 return 0; 2917 } 2918 2919 - /* We always reserve for an inode update; the superblock could be there too */ 2920 - static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len) 2921 - { 2922 - if (likely(ext4_has_feature_large_file(inode->i_sb))) 2923 - return 1; 2924 - 2925 - if (pos + len <= 0x7fffffffULL) 2926 - return 1; 2927 - 2928 - /* We might need to update the superblock to set LARGE_FILE */ 2929 - return 2; 2930 - } 2931 - 2932 static int ext4_da_write_begin(struct file *file, struct address_space *mapping, 2933 loff_t pos, unsigned len, unsigned flags, 2934 struct page **pagep, void **fsdata) ··· 2924 struct page *page; 2925 pgoff_t index; 2926 struct inode *inode = mapping->host; 2927 - handle_t *handle; 2928 2929 if 
(unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) 2930 return -EIO; ··· 2949 return 0; 2950 } 2951 2952 - /* 2953 - * grab_cache_page_write_begin() can take a long time if the 2954 - * system is thrashing due to memory pressure, or if the page 2955 - * is being written back. So grab it first before we start 2956 - * the transaction handle. This also allows us to allocate 2957 - * the page (if needed) without using GFP_NOFS. 2958 - */ 2959 - retry_grab: 2960 page = grab_cache_page_write_begin(mapping, index, flags); 2961 if (!page) 2962 return -ENOMEM; 2963 - unlock_page(page); 2964 2965 - /* 2966 - * With delayed allocation, we don't log the i_disksize update 2967 - * if there is delayed block allocation. But we still need 2968 - * to journalling the i_disksize update if writes to the end 2969 - * of file which has an already mapped buffer. 2970 - */ 2971 - retry_journal: 2972 - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 2973 - ext4_da_write_credits(inode, pos, len)); 2974 - if (IS_ERR(handle)) { 2975 - put_page(page); 2976 - return PTR_ERR(handle); 2977 - } 2978 - 2979 - lock_page(page); 2980 - if (page->mapping != mapping) { 2981 - /* The page got truncated from under us */ 2982 - unlock_page(page); 2983 - put_page(page); 2984 - ext4_journal_stop(handle); 2985 - goto retry_grab; 2986 - } 2987 /* In case writeback began while the page was unlocked */ 2988 wait_for_stable_page(page); 2989 ··· 2965 #endif 2966 if (ret < 0) { 2967 unlock_page(page); 2968 - ext4_journal_stop(handle); 2969 /* 2970 * block_write_begin may have instantiated a few blocks 2971 * outside i_size. Trim these off again. Don't need 2972 - * i_size_read because we hold i_mutex. 
2973 */ 2974 if (pos + len > inode->i_size) 2975 ext4_truncate_failed_write(inode); 2976 2977 if (ret == -ENOSPC && 2978 ext4_should_retry_alloc(inode->i_sb, &retries)) 2979 - goto retry_journal; 2980 - 2981 - put_page(page); 2982 return ret; 2983 } 2984 ··· 3013 struct page *page, void *fsdata) 3014 { 3015 struct inode *inode = mapping->host; 3016 - int ret = 0, ret2; 3017 - handle_t *handle = ext4_journal_current_handle(); 3018 loff_t new_i_size; 3019 unsigned long start, end; 3020 int write_mode = (int)(unsigned long)fsdata; ··· 3022 len, copied, page, fsdata); 3023 3024 trace_ext4_da_write_end(inode, pos, len, copied); 3025 - start = pos & (PAGE_SIZE - 1); 3026 - end = start + copied - 1; 3027 - 3028 - /* 3029 - * generic_write_end() will run mark_inode_dirty() if i_size 3030 - * changes. So let's piggyback the i_disksize mark_inode_dirty 3031 - * into that. 3032 - */ 3033 - new_i_size = pos + copied; 3034 - if (copied && new_i_size > EXT4_I(inode)->i_disksize) { 3035 - if (ext4_has_inline_data(inode) || 3036 - ext4_da_should_update_i_disksize(page, end)) { 3037 - ext4_update_i_disksize(inode, new_i_size); 3038 - /* We need to mark inode dirty even if 3039 - * new_i_size is less that inode->i_size 3040 - * bu greater than i_disksize.(hint delalloc) 3041 - */ 3042 - ret = ext4_mark_inode_dirty(handle, inode); 3043 - } 3044 - } 3045 3046 if (write_mode != CONVERT_INLINE_DATA && 3047 ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && 3048 ext4_has_inline_data(inode)) 3049 - ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied, 3050 - page); 3051 - else 3052 - ret2 = generic_write_end(file, mapping, pos, len, copied, 3053 - page, fsdata); 3054 3055 - copied = ret2; 3056 - if (ret2 < 0) 3057 - ret = ret2; 3058 - ret2 = ext4_journal_stop(handle); 3059 - if (unlikely(ret2 && !ret)) 3060 - ret = ret2; 3061 3062 - return ret ? ret : copied; 3063 } 3064 3065 /*
··· 1284 loff_t old_size = inode->i_size; 1285 int ret = 0, ret2; 1286 int i_size_changed = 0; 1287 bool verity = ext4_verity_in_progress(inode); 1288 1289 trace_ext4_write_end(inode, pos, len, copied); 1290 + 1291 + if (ext4_has_inline_data(inode)) 1292 + return ext4_write_inline_data_end(inode, pos, len, copied, page); 1293 + 1294 + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 1295 /* 1296 * it's important to update i_size while still holding page lock: 1297 * page writeout could otherwise come in and zero beyond i_size. ··· 1320 * ordering of page lock and transaction start for journaling 1321 * filesystems. 1322 */ 1323 + if (i_size_changed) 1324 ret = ext4_mark_inode_dirty(handle, inode); 1325 1326 if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) ··· 1329 * inode->i_size. So truncate them 1330 */ 1331 ext4_orphan_add(handle, inode); 1332 + 1333 ret2 = ext4_journal_stop(handle); 1334 if (!ret) 1335 ret = ret2; ··· 1395 int partial = 0; 1396 unsigned from, to; 1397 int size_changed = 0; 1398 bool verity = ext4_verity_in_progress(inode); 1399 1400 trace_ext4_journalled_write_end(inode, pos, len, copied); ··· 1404 1405 BUG_ON(!ext4_handle_valid(handle)); 1406 1407 + if (ext4_has_inline_data(inode)) 1408 + return ext4_write_inline_data_end(inode, pos, len, copied, page); 1409 + 1410 + if (unlikely(copied < len) && !PageUptodate(page)) { 1411 copied = 0; 1412 ext4_journalled_zero_new_buffers(handle, inode, page, from, to); 1413 } else { ··· 1436 if (old_size < pos && !verity) 1437 pagecache_isize_extended(inode, old_size, pos); 1438 1439 + if (size_changed) { 1440 ret2 = ext4_mark_inode_dirty(handle, inode); 1441 if (!ret) 1442 ret = ret2; ··· 1449 */ 1450 ext4_orphan_add(handle, inode); 1451 1452 ret2 = ext4_journal_stop(handle); 1453 if (!ret) 1454 ret = ret2; ··· 2932 return 0; 2933 } 2934 2935 static int ext4_da_write_begin(struct file *file, struct address_space *mapping, 2936 loff_t pos, unsigned len, unsigned 
flags, 2937 struct page **pagep, void **fsdata) ··· 2953 struct page *page; 2954 pgoff_t index; 2955 struct inode *inode = mapping->host; 2956 2957 if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) 2958 return -EIO; ··· 2979 return 0; 2980 } 2981 2982 + retry: 2983 page = grab_cache_page_write_begin(mapping, index, flags); 2984 if (!page) 2985 return -ENOMEM; 2986 2987 /* In case writeback began while the page was unlocked */ 2988 wait_for_stable_page(page); 2989 ··· 3025 #endif 3026 if (ret < 0) { 3027 unlock_page(page); 3028 + put_page(page); 3029 /* 3030 * block_write_begin may have instantiated a few blocks 3031 * outside i_size. Trim these off again. Don't need 3032 + * i_size_read because we hold inode lock. 3033 */ 3034 if (pos + len > inode->i_size) 3035 ext4_truncate_failed_write(inode); 3036 3037 if (ret == -ENOSPC && 3038 ext4_should_retry_alloc(inode->i_sb, &retries)) 3039 + goto retry; 3040 return ret; 3041 } 3042 ··· 3075 struct page *page, void *fsdata) 3076 { 3077 struct inode *inode = mapping->host; 3078 loff_t new_i_size; 3079 unsigned long start, end; 3080 int write_mode = (int)(unsigned long)fsdata; ··· 3086 len, copied, page, fsdata); 3087 3088 trace_ext4_da_write_end(inode, pos, len, copied); 3089 3090 if (write_mode != CONVERT_INLINE_DATA && 3091 ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && 3092 ext4_has_inline_data(inode)) 3093 + return ext4_write_inline_data_end(inode, pos, len, copied, page); 3094 3095 + start = pos & (PAGE_SIZE - 1); 3096 + end = start + copied - 1; 3097 3098 + /* 3099 + * Since we are holding inode lock, we are sure i_disksize <= 3100 + * i_size. We also know that if i_disksize < i_size, there are 3101 + * delalloc writes pending in the range upto i_size. If the end of 3102 + * the current write is <= i_size, there's no need to touch 3103 + * i_disksize since writeback will push i_disksize upto i_size 3104 + * eventually. 
If the end of the current write is > i_size and 3105 + * inside an allocated block (ext4_da_should_update_i_disksize() 3106 + * check), we need to update i_disksize here as neither 3107 + * ext4_writepage() nor certain ext4_writepages() paths not 3108 + * allocating blocks update i_disksize. 3109 + * 3110 + * Note that we defer inode dirtying to generic_write_end() / 3111 + * ext4_da_write_inline_data_end(). 3112 + */ 3113 + new_i_size = pos + copied; 3114 + if (copied && new_i_size > inode->i_size && 3115 + ext4_da_should_update_i_disksize(page, end)) 3116 + ext4_update_i_disksize(inode, new_i_size); 3117 + 3118 + return generic_write_end(file, mapping, pos, len, copied, page, fsdata); 3119 } 3120 3121 /*