Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
"There are a number of major cleanups in ext4 this cycle:

- The data=journal writepath has been significantly cleaned up and
simplified, and reduces a large number of data=journal special
cases by Jan Kara.

- Ojaswin Mujoo has replaced the linked list used to track extents that
have been used for inode preallocation with a red-black tree in the
multi-block allocator. This improves performance for workloads
which do a large number of random allocating writes.

- Thanks to Kemeng Shi for a lot of cleanup and bug fixes in the
multi-block allocator.

- Matthew Wilcox has converted the code paths for reading and writing
ext4 pages to use folios.

- Jason Yan has continued to factor out ext4_fill_super() into
smaller functions to improve ease of maintenance and
comprehension.

- Josh Triplett has created a uapi header for ext4 userspace APIs"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (105 commits)
ext4: Add a uapi header for ext4 userspace APIs
ext4: remove useless conditional branch code
ext4: remove unneeded check of nr_to_submit
ext4: move dax and encrypt checking into ext4_check_feature_compatibility()
ext4: factor out ext4_block_group_meta_init()
ext4: move s_reserved_gdt_blocks and addressable checking into ext4_check_geometry()
ext4: rename two functions with 'check'
ext4: factor out ext4_flex_groups_free()
ext4: use ext4_group_desc_free() in ext4_put_super() to save some duplicated code
ext4: factor out ext4_percpu_param_init() and ext4_percpu_param_destroy()
ext4: factor out ext4_hash_info_init()
Revert "ext4: Fix warnings when freezing filesystem with journaled data"
ext4: Update comment in mpage_prepare_extent_to_map()
ext4: Simplify handling of journalled data in ext4_bmap()
ext4: Drop special handling of journalled data from ext4_quota_on()
ext4: Drop special handling of journalled data from ext4_evict_inode()
ext4: Fix special handling of journalled data from extent zeroing
ext4: Drop special handling of journalled data from extent shifting operations
ext4: Drop special handling of journalled data from ext4_sync_file()
ext4: Commit transaction before writing back pages in data=journal mode
...

+1414 -1442
-3
Documentation/admin-guide/ext4.rst
··· 489 489 multiple of this tuning parameter if the stripe size is not set in the 490 490 ext4 superblock 491 491 492 - mb_max_inode_prealloc 493 - The maximum length of per-inode ext4_prealloc_space list. 494 - 495 492 mb_max_to_scan 496 493 The maximum number of extents the multiblock allocator will search to 497 494 find the best extent.
+1
MAINTAINERS
··· 7745 7745 F: Documentation/filesystems/ext4/ 7746 7746 F: fs/ext4/ 7747 7747 F: include/trace/events/ext4.h 7748 + F: include/uapi/linux/ext4.h 7748 7749 7749 7750 Extended Verification Module (EVM) 7750 7751 M: Mimi Zohar <zohar@linux.ibm.com>
+1
block/bio.c
··· 1159 1159 return false; 1160 1160 return bio_add_page(bio, &folio->page, len, off) > 0; 1161 1161 } 1162 + EXPORT_SYMBOL(bio_add_folio); 1162 1163 1163 1164 void __bio_release_pages(struct bio *bio, bool mark_dirty) 1164 1165 {
+62 -62
fs/ext4/balloc.c
··· 80 80 return (actual_group == block_group) ? 1 : 0; 81 81 } 82 82 83 - /* Return the number of clusters used for file system metadata; this 83 + /* 84 + * Return the number of clusters used for file system metadata; this 84 85 * represents the overhead needed by the file system. 85 86 */ 86 87 static unsigned ext4_num_overhead_clusters(struct super_block *sb, 87 88 ext4_group_t block_group, 88 89 struct ext4_group_desc *gdp) 89 90 { 90 - unsigned num_clusters; 91 - int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c; 91 + unsigned base_clusters, num_clusters; 92 + int block_cluster = -1, inode_cluster; 93 + int itbl_cluster_start = -1, itbl_cluster_end = -1; 92 94 ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group); 93 - ext4_fsblk_t itbl_blk; 95 + ext4_fsblk_t end = start + EXT4_BLOCKS_PER_GROUP(sb) - 1; 96 + ext4_fsblk_t itbl_blk_start, itbl_blk_end; 94 97 struct ext4_sb_info *sbi = EXT4_SB(sb); 95 98 96 99 /* This is the number of clusters used by the superblock, 97 100 * block group descriptors, and reserved block group 98 101 * descriptor blocks */ 99 - num_clusters = ext4_num_base_meta_clusters(sb, block_group); 102 + base_clusters = ext4_num_base_meta_clusters(sb, block_group); 103 + num_clusters = base_clusters; 100 104 101 105 /* 102 - * For the allocation bitmaps and inode table, we first need 103 - * to check to see if the block is in the block group. If it 104 - * is, then check to see if the cluster is already accounted 105 - * for in the clusters used for the base metadata cluster, or 106 - * if we can increment the base metadata cluster to include 107 - * that block. Otherwise, we will have to track the cluster 108 - * used for the allocation bitmap or inode table explicitly. 106 + * Account and record inode table clusters if any cluster 107 + * is in the block group, or inode table cluster range is 108 + * [-1, -1] and won't overlap with block/inode bitmap cluster 109 + * accounted below. 
110 + */ 111 + itbl_blk_start = ext4_inode_table(sb, gdp); 112 + itbl_blk_end = itbl_blk_start + sbi->s_itb_per_group - 1; 113 + if (itbl_blk_start <= end && itbl_blk_end >= start) { 114 + itbl_blk_start = itbl_blk_start >= start ? 115 + itbl_blk_start : start; 116 + itbl_blk_end = itbl_blk_end <= end ? 117 + itbl_blk_end : end; 118 + 119 + itbl_cluster_start = EXT4_B2C(sbi, itbl_blk_start - start); 120 + itbl_cluster_end = EXT4_B2C(sbi, itbl_blk_end - start); 121 + 122 + num_clusters += itbl_cluster_end - itbl_cluster_start + 1; 123 + /* check if border cluster is overlapped */ 124 + if (itbl_cluster_start == base_clusters - 1) 125 + num_clusters--; 126 + } 127 + 128 + /* 129 + * For the allocation bitmaps, we first need to check to see 130 + * if the block is in the block group. If it is, then check 131 + * to see if the cluster is already accounted for in the clusters 132 + * used for the base metadata cluster and inode tables cluster. 109 133 * Normally all of these blocks are contiguous, so the special 110 134 * case handling shouldn't be necessary except for *very* 111 135 * unusual file system layouts. 
··· 137 113 if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { 138 114 block_cluster = EXT4_B2C(sbi, 139 115 ext4_block_bitmap(sb, gdp) - start); 140 - if (block_cluster < num_clusters) 141 - block_cluster = -1; 142 - else if (block_cluster == num_clusters) { 116 + if (block_cluster >= base_clusters && 117 + (block_cluster < itbl_cluster_start || 118 + block_cluster > itbl_cluster_end)) 143 119 num_clusters++; 144 - block_cluster = -1; 145 - } 146 120 } 147 121 148 122 if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { 149 123 inode_cluster = EXT4_B2C(sbi, 150 124 ext4_inode_bitmap(sb, gdp) - start); 151 - if (inode_cluster < num_clusters) 152 - inode_cluster = -1; 153 - else if (inode_cluster == num_clusters) { 125 + /* 126 + * Additional check if inode bitmap is in just accounted 127 + * block_cluster 128 + */ 129 + if (inode_cluster != block_cluster && 130 + inode_cluster >= base_clusters && 131 + (inode_cluster < itbl_cluster_start || 132 + inode_cluster > itbl_cluster_end)) 154 133 num_clusters++; 155 - inode_cluster = -1; 156 - } 157 134 } 158 - 159 - itbl_blk = ext4_inode_table(sb, gdp); 160 - for (i = 0; i < sbi->s_itb_per_group; i++) { 161 - if (ext4_block_in_group(sb, itbl_blk + i, block_group)) { 162 - c = EXT4_B2C(sbi, itbl_blk + i - start); 163 - if ((c < num_clusters) || (c == inode_cluster) || 164 - (c == block_cluster) || (c == itbl_cluster)) 165 - continue; 166 - if (c == num_clusters) { 167 - num_clusters++; 168 - continue; 169 - } 170 - num_clusters++; 171 - itbl_cluster = c; 172 - } 173 - } 174 - 175 - if (block_cluster != -1) 176 - num_clusters++; 177 - if (inode_cluster != -1) 178 - num_clusters++; 179 135 180 136 return num_clusters; 181 137 } ··· 191 187 192 188 ASSERT(buffer_locked(bh)); 193 189 194 - /* If checksum is bad mark all blocks used to prevent allocation 195 - * essentially implementing a per-group read-only flag. 
*/ 196 190 if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { 197 191 ext4_mark_group_bitmap_corrupted(sb, block_group, 198 192 EXT4_GROUP_INFO_BBITMAP_CORRUPT | ··· 352 350 blk = ext4_inode_table(sb, desc); 353 351 offset = blk - group_first_block; 354 352 if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || 355 - EXT4_B2C(sbi, offset + sbi->s_itb_per_group) >= max_bit) 353 + EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) >= max_bit) 356 354 return blk; 357 355 next_zero_bit = ext4_find_next_zero_bit(bh->b_data, 358 - EXT4_B2C(sbi, offset + sbi->s_itb_per_group), 356 + EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1, 359 357 EXT4_B2C(sbi, offset)); 360 358 if (next_zero_bit < 361 - EXT4_B2C(sbi, offset + sbi->s_itb_per_group)) 359 + EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1) 362 360 /* bad bitmap for inode tables */ 363 361 return blk; 364 362 return 0; ··· 385 383 ext4_lock_group(sb, block_group); 386 384 if (buffer_verified(bh)) 387 385 goto verified; 388 - if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group, 389 - desc, bh) || 386 + if (unlikely(!ext4_block_bitmap_csum_verify(sb, desc, bh) || 390 387 ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) { 391 388 ext4_unlock_group(sb, block_group); 392 389 ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); ··· 475 474 goto out; 476 475 } 477 476 err = ext4_init_block_bitmap(sb, bh, block_group, desc); 477 + if (err) { 478 + ext4_unlock_group(sb, block_group); 479 + unlock_buffer(bh); 480 + ext4_error(sb, "Failed to init block bitmap for group " 481 + "%u: %d", block_group, err); 482 + goto out; 483 + } 478 484 set_bitmap_uptodate(bh); 479 485 set_buffer_uptodate(bh); 480 486 set_buffer_verified(bh); 481 487 ext4_unlock_group(sb, block_group); 482 488 unlock_buffer(bh); 483 - if (err) { 484 - ext4_error(sb, "Failed to init block bitmap for group " 485 - "%u: %d", block_group, err); 486 - goto out; 487 - } 488 - goto verify; 489 + return bh; 489 490 } 490 491 
ext4_unlock_group(sb, block_group); 491 492 if (buffer_uptodate(bh)) { ··· 845 842 if (!ext4_bg_has_super(sb, group)) 846 843 return 0; 847 844 848 - if (ext4_has_feature_meta_bg(sb)) 849 - return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); 850 - else 851 - return EXT4_SB(sb)->s_gdb_count; 845 + return EXT4_SB(sb)->s_gdb_count; 852 846 } 853 847 854 848 /** ··· 887 887 block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * 888 888 sbi->s_desc_per_block) { 889 889 if (num) { 890 - num += ext4_bg_num_gdb(sb, block_group); 890 + num += ext4_bg_num_gdb_nometa(sb, block_group); 891 891 num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); 892 892 } 893 893 } else { /* For META_BG_BLOCK_GROUPS */ 894 - num += ext4_bg_num_gdb(sb, block_group); 894 + num += ext4_bg_num_gdb_meta(sb, block_group); 895 895 } 896 896 return EXT4_NUM_B2C(sbi, num); 897 897 }
+5 -8
fs/ext4/bitmap.c
··· 16 16 return numchars * BITS_PER_BYTE - memweight(bitmap, numchars); 17 17 } 18 18 19 - int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, 19 + int ext4_inode_bitmap_csum_verify(struct super_block *sb, 20 20 struct ext4_group_desc *gdp, 21 21 struct buffer_head *bh, int sz) 22 22 { ··· 38 38 return provided == calculated; 39 39 } 40 40 41 - void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, 41 + void ext4_inode_bitmap_csum_set(struct super_block *sb, 42 42 struct ext4_group_desc *gdp, 43 43 struct buffer_head *bh, int sz) 44 44 { ··· 54 54 gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16); 55 55 } 56 56 57 - int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, 57 + int ext4_block_bitmap_csum_verify(struct super_block *sb, 58 58 struct ext4_group_desc *gdp, 59 59 struct buffer_head *bh) 60 60 { ··· 74 74 } else 75 75 calculated &= 0xFFFF; 76 76 77 - if (provided == calculated) 78 - return 1; 79 - 80 - return 0; 77 + return provided == calculated; 81 78 } 82 79 83 - void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, 80 + void ext4_block_bitmap_csum_set(struct super_block *sb, 84 81 struct ext4_group_desc *gdp, 85 82 struct buffer_head *bh) 86 83 {
+11 -103
fs/ext4/ext4.h
··· 40 40 #ifdef __KERNEL__ 41 41 #include <linux/compat.h> 42 42 #endif 43 + #include <uapi/linux/ext4.h> 43 44 44 45 #include <linux/fscrypt.h> 45 46 #include <linux/fsverity.h> ··· 592 591 CHECK_FLAG_VALUE(RESERVED); 593 592 } 594 593 595 - /* Used to pass group descriptor data when online resize is done */ 596 - struct ext4_new_group_input { 597 - __u32 group; /* Group number for this data */ 598 - __u64 block_bitmap; /* Absolute block number of block bitmap */ 599 - __u64 inode_bitmap; /* Absolute block number of inode bitmap */ 600 - __u64 inode_table; /* Absolute block number of inode table start */ 601 - __u32 blocks_count; /* Total number of blocks in this group */ 602 - __u16 reserved_blocks; /* Number of reserved blocks in this group */ 603 - __u16 unused; 604 - }; 605 - 606 594 #if defined(__KERNEL__) && defined(CONFIG_COMPAT) 607 595 struct compat_ext4_new_group_input { 608 596 u32 group; ··· 688 698 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 689 699 #define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040 690 700 691 - /* 692 - * ioctl commands 693 - */ 694 - #define EXT4_IOC_GETVERSION _IOR('f', 3, long) 695 - #define EXT4_IOC_SETVERSION _IOW('f', 4, long) 696 - #define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION 697 - #define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION 698 - #define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) 699 - #define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) 700 - #define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) 701 - #define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) 702 - #define EXT4_IOC_MIGRATE _IO('f', 9) 703 - /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ 704 - /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ 705 - #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) 706 - #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) 707 - #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) 708 - #define EXT4_IOC_SWAP_BOOT _IO('f', 17) 709 - #define EXT4_IOC_PRECACHE_EXTENTS 
_IO('f', 18) 710 - /* ioctl codes 19--39 are reserved for fscrypt */ 711 - #define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) 712 - #define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) 713 - #define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) 714 - #define EXT4_IOC_CHECKPOINT _IOW('f', 43, __u32) 715 - #define EXT4_IOC_GETFSUUID _IOR('f', 44, struct fsuuid) 716 - #define EXT4_IOC_SETFSUUID _IOW('f', 44, struct fsuuid) 717 - 718 - #define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32) 719 - 720 - /* 721 - * Flags for going down operation 722 - */ 723 - #define EXT4_GOING_FLAGS_DEFAULT 0x0 /* going down */ 724 - #define EXT4_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ 725 - #define EXT4_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ 726 - 727 - /* 728 - * Flags returned by EXT4_IOC_GETSTATE 729 - * 730 - * We only expose to userspace a subset of the state flags in 731 - * i_state_flags 732 - */ 733 - #define EXT4_STATE_FLAG_EXT_PRECACHED 0x00000001 734 - #define EXT4_STATE_FLAG_NEW 0x00000002 735 - #define EXT4_STATE_FLAG_NEWENTRY 0x00000004 736 - #define EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008 737 - 738 - /* flags for ioctl EXT4_IOC_CHECKPOINT */ 739 - #define EXT4_IOC_CHECKPOINT_FLAG_DISCARD 0x1 740 - #define EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT 0x2 741 - #define EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN 0x4 742 - #define EXT4_IOC_CHECKPOINT_FLAG_VALID (EXT4_IOC_CHECKPOINT_FLAG_DISCARD | \ 743 - EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT | \ 744 - EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN) 745 - 746 - /* 747 - * Structure for EXT4_IOC_GETFSUUID/EXT4_IOC_SETFSUUID 748 - */ 749 - struct fsuuid { 750 - __u32 fsu_len; 751 - __u32 fsu_flags; 752 - __u8 fsu_uuid[]; 753 - }; 754 - 755 701 #if defined(__KERNEL__) && defined(CONFIG_COMPAT) 756 702 /* 757 703 * ioctl commands in 32 bit emulation ··· 701 775 #define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION 702 776 #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION 703 777 #endif 704 - 705 - /* 706 - * Returned by EXT4_IOC_GET_ES_CACHE 
as an additional possible flag. 707 - * It indicates that the entry in extent status cache is for a hole. 708 - */ 709 - #define EXT4_FIEMAP_EXTENT_HOLE 0x08000000 710 778 711 779 /* Max physical block we can address w/o extents */ 712 780 #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF ··· 770 850 __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ 771 851 __le32 i_version_hi; /* high 32 bits for 64-bit version */ 772 852 __le32 i_projid; /* Project ID */ 773 - }; 774 - 775 - struct move_extent { 776 - __u32 reserved; /* should be zero */ 777 - __u32 donor_fd; /* donor file descriptor */ 778 - __u64 orig_start; /* logical start offset in block for orig */ 779 - __u64 donor_start; /* logical start offset in block for donor */ 780 - __u64 len; /* block length to be moved */ 781 - __u64 moved_len; /* moved block length */ 782 853 }; 783 854 784 855 #define EXT4_EPOCH_BITS 2 ··· 1031 1120 1032 1121 /* mballoc */ 1033 1122 atomic_t i_prealloc_active; 1034 - struct list_head i_prealloc_list; 1035 - spinlock_t i_prealloc_lock; 1123 + struct rb_root i_prealloc_node; 1124 + rwlock_t i_prealloc_lock; 1036 1125 1037 1126 /* extents status tree */ 1038 1127 struct ext4_es_tree i_es_tree; ··· 1524 1613 unsigned int s_mb_stats; 1525 1614 unsigned int s_mb_order2_reqs; 1526 1615 unsigned int s_mb_group_prealloc; 1527 - unsigned int s_mb_max_inode_prealloc; 1528 1616 unsigned int s_max_dir_size_kb; 1529 1617 /* where last allocation was done - for stream allocation */ 1530 1618 unsigned long s_mb_last_group; ··· 1797 1887 * Inode dynamic state flags 1798 1888 */ 1799 1889 enum { 1800 - EXT4_STATE_JDATA, /* journaled data exists */ 1801 1890 EXT4_STATE_NEW, /* inode is newly created */ 1802 1891 EXT4_STATE_XATTR, /* has in-inode xattrs */ 1803 1892 EXT4_STATE_NO_EXPAND, /* No space for expansion */ ··· 2585 2676 2586 2677 /* bitmap.c */ 2587 2678 extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); 2588 - void ext4_inode_bitmap_csum_set(struct 
super_block *sb, ext4_group_t group, 2679 + void ext4_inode_bitmap_csum_set(struct super_block *sb, 2589 2680 struct ext4_group_desc *gdp, 2590 2681 struct buffer_head *bh, int sz); 2591 - int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, 2682 + int ext4_inode_bitmap_csum_verify(struct super_block *sb, 2592 2683 struct ext4_group_desc *gdp, 2593 2684 struct buffer_head *bh, int sz); 2594 - void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, 2685 + void ext4_block_bitmap_csum_set(struct super_block *sb, 2595 2686 struct ext4_group_desc *gdp, 2596 2687 struct buffer_head *bh); 2597 - int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, 2688 + int ext4_block_bitmap_csum_verify(struct super_block *sb, 2598 2689 struct ext4_group_desc *gdp, 2599 2690 struct buffer_head *bh); 2600 2691 ··· 3459 3550 unsigned int len); 3460 3551 extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); 3461 3552 3462 - extern int ext4_readpage_inline(struct inode *inode, struct page *page); 3553 + int ext4_readpage_inline(struct inode *inode, struct folio *folio); 3463 3554 extern int ext4_try_to_write_inline_data(struct address_space *mapping, 3464 3555 struct inode *inode, 3465 3556 loff_t pos, unsigned len, ··· 3556 3647 3557 3648 /* readpages.c */ 3558 3649 extern int ext4_mpage_readpages(struct inode *inode, 3559 - struct readahead_control *rac, struct page *page); 3650 + struct readahead_control *rac, struct folio *folio); 3560 3651 extern int __init ext4_init_post_read_processing(void); 3561 3652 extern void ext4_exit_post_read_processing(void); 3562 3653 ··· 3666 3757 struct writeback_control *wbc); 3667 3758 extern void ext4_end_io_rsv_work(struct work_struct *work); 3668 3759 extern void ext4_io_submit(struct ext4_io_submit *io); 3669 - extern int ext4_bio_write_page(struct ext4_io_submit *io, 3670 - struct page *page, 3671 - int len); 3760 + int ext4_bio_write_folio(struct 
ext4_io_submit *io, struct folio *page, 3761 + size_t len); 3672 3762 extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); 3673 3763 extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); 3674 3764
+14 -21
fs/ext4/extents.c
··· 4526 4526 4527 4527 trace_ext4_zero_range(inode, offset, len, mode); 4528 4528 4529 - /* Call ext4_force_commit to flush all data in case of data=journal. */ 4530 - if (ext4_should_journal_data(inode)) { 4531 - ret = ext4_force_commit(inode->i_sb); 4532 - if (ret) 4533 - return ret; 4534 - } 4535 - 4536 4529 /* 4537 4530 * Round up offset. This is not fallocate, we need to zero out 4538 4531 * blocks, so convert interior block aligned part of the range to ··· 4609 4616 filemap_invalidate_unlock(mapping); 4610 4617 goto out_mutex; 4611 4618 } 4619 + 4620 + /* 4621 + * For journalled data we need to write (and checkpoint) pages 4622 + * before discarding page cache to avoid inconsitent data on 4623 + * disk in case of crash before zeroing trans is committed. 4624 + */ 4625 + if (ext4_should_journal_data(inode)) { 4626 + ret = filemap_write_and_wait_range(mapping, start, end); 4627 + if (ret) { 4628 + filemap_invalidate_unlock(mapping); 4629 + goto out_mutex; 4630 + } 4631 + } 4632 + 4612 4633 /* Now release the pages and zero block aligned part of pages */ 4613 4634 truncate_pagecache_range(inode, start, end - 1); 4614 4635 inode->i_mtime = inode->i_ctime = current_time(inode); ··· 5297 5290 punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); 5298 5291 punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); 5299 5292 5300 - /* Call ext4_force_commit to flush all data in case of data=journal. 
*/ 5301 - if (ext4_should_journal_data(inode)) { 5302 - ret = ext4_force_commit(inode->i_sb); 5303 - if (ret) 5304 - return ret; 5305 - } 5306 - 5307 5293 inode_lock(inode); 5308 5294 /* 5309 5295 * There is no need to overlap collapse range with EOF, in which case ··· 5442 5442 5443 5443 offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); 5444 5444 len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb); 5445 - 5446 - /* Call ext4_force_commit to flush all data in case of data=journal */ 5447 - if (ext4_should_journal_data(inode)) { 5448 - ret = ext4_force_commit(inode->i_sb); 5449 - if (ret) 5450 - return ret; 5451 - } 5452 5445 5453 5446 inode_lock(inode); 5454 5447 /* Currently just for extent based files */
-11
fs/ext4/fsync.c
··· 153 153 goto out; 154 154 155 155 /* 156 - * data=writeback,ordered: 157 156 * The caller's filemap_fdatawrite()/wait will sync the data. 158 157 * Metadata is in the journal, we wait for proper transaction to 159 158 * commit here. 160 - * 161 - * data=journal: 162 - * filemap_fdatawrite won't do anything (the buffers are clean). 163 - * ext4_force_commit will write the file data into the journal and 164 - * will wait on that. 165 - * filemap_fdatawait() will encounter a ton of newly-dirtied pages 166 - * (they were dirtied by commit). But that's OK - the blocks are 167 - * safe in-journal, which is all fsync() needs to ensure. 168 159 */ 169 160 if (!sbi->s_journal) 170 161 ret = ext4_fsync_nojournal(inode, datasync, &needs_barrier); 171 - else if (ext4_should_journal_data(inode)) 172 - ret = ext4_force_commit(inode->i_sb); 173 162 else 174 163 ret = ext4_fsync_journal(inode, datasync, &needs_barrier); 175 164
+6 -8
fs/ext4/ialloc.c
··· 98 98 if (buffer_verified(bh)) 99 99 goto verified; 100 100 blk = ext4_inode_bitmap(sb, desc); 101 - if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh, 101 + if (!ext4_inode_bitmap_csum_verify(sb, desc, bh, 102 102 EXT4_INODES_PER_GROUP(sb) / 8) || 103 103 ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) { 104 104 ext4_unlock_group(sb, block_group); ··· 327 327 if (percpu_counter_initialized(&sbi->s_dirs_counter)) 328 328 percpu_counter_dec(&sbi->s_dirs_counter); 329 329 } 330 - ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh, 330 + ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh, 331 331 EXT4_INODES_PER_GROUP(sb) / 8); 332 332 ext4_group_desc_csum_set(sb, block_group, gdp); 333 333 ext4_unlock_group(sb, block_group); ··· 813 813 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 814 814 ext4_free_group_clusters_set(sb, gdp, 815 815 ext4_free_clusters_after_init(sb, group, gdp)); 816 - ext4_block_bitmap_csum_set(sb, group, gdp, 817 - block_bitmap_bh); 816 + ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh); 818 817 ext4_group_desc_csum_set(sb, group, gdp); 819 818 } 820 819 ext4_unlock_group(sb, group); ··· 851 852 852 853 ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); 853 854 if (ext4_has_group_desc_csum(sb)) { 854 - ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh, 855 + ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh, 855 856 EXT4_INODES_PER_GROUP(sb) / 8); 856 857 ext4_group_desc_csum_set(sb, group, gdp); 857 858 } ··· 1164 1165 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 1165 1166 ext4_free_group_clusters_set(sb, gdp, 1166 1167 ext4_free_clusters_after_init(sb, group, gdp)); 1167 - ext4_block_bitmap_csum_set(sb, group, gdp, 1168 - block_bitmap_bh); 1168 + ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh); 1169 1169 ext4_group_desc_csum_set(sb, group, gdp); 1170 1170 } 1171 1171 ext4_unlock_group(sb, group); ··· 1220 1222 } 1221 1223 } 1222 1224 if (ext4_has_group_desc_csum(sb)) { 1223 - 
ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh, 1225 + ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh, 1224 1226 EXT4_INODES_PER_GROUP(sb) / 8); 1225 1227 ext4_group_desc_csum_set(sb, group, gdp); 1226 1228 }
+84 -87
fs/ext4/inline.c
··· 467 467 return error; 468 468 } 469 469 470 - static int ext4_read_inline_page(struct inode *inode, struct page *page) 470 + static int ext4_read_inline_folio(struct inode *inode, struct folio *folio) 471 471 { 472 472 void *kaddr; 473 473 int ret = 0; 474 474 size_t len; 475 475 struct ext4_iloc iloc; 476 476 477 - BUG_ON(!PageLocked(page)); 477 + BUG_ON(!folio_test_locked(folio)); 478 478 BUG_ON(!ext4_has_inline_data(inode)); 479 - BUG_ON(page->index); 479 + BUG_ON(folio->index); 480 480 481 481 if (!EXT4_I(inode)->i_inline_off) { 482 482 ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.", ··· 489 489 goto out; 490 490 491 491 len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode)); 492 - kaddr = kmap_atomic(page); 492 + BUG_ON(len > PAGE_SIZE); 493 + kaddr = kmap_local_folio(folio, 0); 493 494 ret = ext4_read_inline_data(inode, kaddr, len, &iloc); 494 - flush_dcache_page(page); 495 - kunmap_atomic(kaddr); 496 - zero_user_segment(page, len, PAGE_SIZE); 497 - SetPageUptodate(page); 495 + flush_dcache_folio(folio); 496 + kunmap_local(kaddr); 497 + folio_zero_segment(folio, len, folio_size(folio)); 498 + folio_mark_uptodate(folio); 498 499 brelse(iloc.bh); 499 500 500 501 out: 501 502 return ret; 502 503 } 503 504 504 - int ext4_readpage_inline(struct inode *inode, struct page *page) 505 + int ext4_readpage_inline(struct inode *inode, struct folio *folio) 505 506 { 506 507 int ret = 0; 507 508 ··· 516 515 * Current inline data can only exist in the 1st page, 517 516 * So for all the other pages, just set them uptodate. 
518 517 */ 519 - if (!page->index) 520 - ret = ext4_read_inline_page(inode, page); 521 - else if (!PageUptodate(page)) { 522 - zero_user_segment(page, 0, PAGE_SIZE); 523 - SetPageUptodate(page); 518 + if (!folio->index) 519 + ret = ext4_read_inline_folio(inode, folio); 520 + else if (!folio_test_uptodate(folio)) { 521 + folio_zero_segment(folio, 0, folio_size(folio)); 522 + folio_mark_uptodate(folio); 524 523 } 525 524 526 525 up_read(&EXT4_I(inode)->xattr_sem); 527 526 528 - unlock_page(page); 527 + folio_unlock(folio); 529 528 return ret >= 0 ? 0 : ret; 530 529 } 531 530 ··· 535 534 int ret, needed_blocks, no_expand; 536 535 handle_t *handle = NULL; 537 536 int retries = 0, sem_held = 0; 538 - struct page *page = NULL; 539 - unsigned int flags; 537 + struct folio *folio = NULL; 540 538 unsigned from, to; 541 539 struct ext4_iloc iloc; 542 540 ··· 564 564 565 565 /* We cannot recurse into the filesystem as the transaction is already 566 566 * started */ 567 - flags = memalloc_nofs_save(); 568 - page = grab_cache_page_write_begin(mapping, 0); 569 - memalloc_nofs_restore(flags); 570 - if (!page) { 567 + folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, 568 + mapping_gfp_mask(mapping)); 569 + if (!folio) { 571 570 ret = -ENOMEM; 572 571 goto out; 573 572 } ··· 581 582 582 583 from = 0; 583 584 to = ext4_get_inline_size(inode); 584 - if (!PageUptodate(page)) { 585 - ret = ext4_read_inline_page(inode, page); 585 + if (!folio_test_uptodate(folio)) { 586 + ret = ext4_read_inline_folio(inode, folio); 586 587 if (ret < 0) 587 588 goto out; 588 589 } ··· 592 593 goto out; 593 594 594 595 if (ext4_should_dioread_nolock(inode)) { 595 - ret = __block_write_begin(page, from, to, 596 + ret = __block_write_begin(&folio->page, from, to, 596 597 ext4_get_block_unwritten); 597 598 } else 598 - ret = __block_write_begin(page, from, to, ext4_get_block); 599 + ret = __block_write_begin(&folio->page, from, to, ext4_get_block); 599 600 600 601 if (!ret && 
ext4_should_journal_data(inode)) { 601 - ret = ext4_walk_page_buffers(handle, inode, page_buffers(page), 602 - from, to, NULL, 603 - do_journal_get_write_access); 602 + ret = ext4_walk_page_buffers(handle, inode, 603 + folio_buffers(folio), from, to, 604 + NULL, do_journal_get_write_access); 604 605 } 605 606 606 607 if (ret) { 607 - unlock_page(page); 608 - put_page(page); 609 - page = NULL; 608 + folio_unlock(folio); 609 + folio_put(folio); 610 + folio = NULL; 610 611 ext4_orphan_add(handle, inode); 611 612 ext4_write_unlock_xattr(inode, &no_expand); 612 613 sem_held = 0; ··· 626 627 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 627 628 goto retry; 628 629 629 - if (page) 630 - block_commit_write(page, from, to); 630 + if (folio) 631 + block_commit_write(&folio->page, from, to); 631 632 out: 632 - if (page) { 633 - unlock_page(page); 634 - put_page(page); 633 + if (folio) { 634 + folio_unlock(folio); 635 + folio_put(folio); 635 636 } 636 637 if (sem_held) 637 638 ext4_write_unlock_xattr(inode, &no_expand); ··· 654 655 { 655 656 int ret; 656 657 handle_t *handle; 657 - unsigned int flags; 658 - struct page *page; 658 + struct folio *folio; 659 659 struct ext4_iloc iloc; 660 660 661 661 if (pos + len > ext4_get_max_inline_size(inode)) ··· 691 693 if (ret) 692 694 goto out; 693 695 694 - flags = memalloc_nofs_save(); 695 - page = grab_cache_page_write_begin(mapping, 0); 696 - memalloc_nofs_restore(flags); 697 - if (!page) { 696 + folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, 697 + mapping_gfp_mask(mapping)); 698 + if (!folio) { 698 699 ret = -ENOMEM; 699 700 goto out; 700 701 } 701 702 702 - *pagep = page; 703 + *pagep = &folio->page; 703 704 down_read(&EXT4_I(inode)->xattr_sem); 704 705 if (!ext4_has_inline_data(inode)) { 705 706 ret = 0; 706 - unlock_page(page); 707 - put_page(page); 707 + folio_unlock(folio); 708 + folio_put(folio); 708 709 goto out_up_read; 709 710 } 710 711 711 - if (!PageUptodate(page)) { 712 - ret 
= ext4_read_inline_page(inode, page); 712 + if (!folio_test_uptodate(folio)) { 713 + ret = ext4_read_inline_folio(inode, folio); 713 714 if (ret < 0) { 714 - unlock_page(page); 715 - put_page(page); 715 + folio_unlock(folio); 716 + folio_put(folio); 716 717 goto out_up_read; 717 718 } 718 719 } ··· 732 735 int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, 733 736 unsigned copied, struct page *page) 734 737 { 738 + struct folio *folio = page_folio(page); 735 739 handle_t *handle = ext4_journal_current_handle(); 736 740 int no_expand; 737 741 void *kaddr; 738 742 struct ext4_iloc iloc; 739 743 int ret = 0, ret2; 740 744 741 - if (unlikely(copied < len) && !PageUptodate(page)) 745 + if (unlikely(copied < len) && !folio_test_uptodate(folio)) 742 746 copied = 0; 743 747 744 748 if (likely(copied)) { 745 749 ret = ext4_get_inode_loc(inode, &iloc); 746 750 if (ret) { 747 - unlock_page(page); 748 - put_page(page); 751 + folio_unlock(folio); 752 + folio_put(folio); 749 753 ext4_std_error(inode->i_sb, ret); 750 754 goto out; 751 755 } ··· 760 762 */ 761 763 (void) ext4_find_inline_data_nolock(inode); 762 764 763 - kaddr = kmap_atomic(page); 765 + kaddr = kmap_local_folio(folio, 0); 764 766 ext4_write_inline_data(inode, &iloc, kaddr, pos, copied); 765 - kunmap_atomic(kaddr); 766 - SetPageUptodate(page); 767 - /* clear page dirty so that writepages wouldn't work for us. */ 768 - ClearPageDirty(page); 767 + kunmap_local(kaddr); 768 + folio_mark_uptodate(folio); 769 + /* clear dirty flag so that writepages wouldn't work for us. */ 770 + folio_clear_dirty(folio); 769 771 770 772 ext4_write_unlock_xattr(inode, &no_expand); 771 773 brelse(iloc.bh); 772 774 773 775 /* 774 - * It's important to update i_size while still holding page 776 + * It's important to update i_size while still holding folio 775 777 * lock: page writeout could otherwise come in and zero 776 778 * beyond i_size. 
777 779 */ 778 780 ext4_update_inode_size(inode, pos + copied); 779 781 } 780 - unlock_page(page); 781 - put_page(page); 782 + folio_unlock(folio); 783 + folio_put(folio); 782 784 783 785 /* 784 - * Don't mark the inode dirty under page lock. First, it unnecessarily 785 - * makes the holding time of page lock longer. Second, it forces lock 786 - * ordering of page lock and transaction start for journaling 786 + * Don't mark the inode dirty under folio lock. First, it unnecessarily 787 + * makes the holding time of folio lock longer. Second, it forces lock 788 + * ordering of folio lock and transaction start for journaling 787 789 * filesystems. 788 790 */ 789 791 if (likely(copied)) ··· 850 852 void **fsdata) 851 853 { 852 854 int ret = 0, inline_size; 853 - struct page *page; 855 + struct folio *folio; 854 856 855 - page = grab_cache_page_write_begin(mapping, 0); 856 - if (!page) 857 + folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN, 858 + mapping_gfp_mask(mapping)); 859 + if (!folio) 857 860 return -ENOMEM; 858 861 859 862 down_read(&EXT4_I(inode)->xattr_sem); ··· 865 866 866 867 inline_size = ext4_get_inline_size(inode); 867 868 868 - if (!PageUptodate(page)) { 869 - ret = ext4_read_inline_page(inode, page); 869 + if (!folio_test_uptodate(folio)) { 870 + ret = ext4_read_inline_folio(inode, folio); 870 871 if (ret < 0) 871 872 goto out; 872 873 } 873 874 874 - ret = __block_write_begin(page, 0, inline_size, 875 + ret = __block_write_begin(&folio->page, 0, inline_size, 875 876 ext4_da_get_block_prep); 876 877 if (ret) { 877 878 up_read(&EXT4_I(inode)->xattr_sem); 878 - unlock_page(page); 879 - put_page(page); 879 + folio_unlock(folio); 880 + folio_put(folio); 880 881 ext4_truncate_failed_write(inode); 881 882 return ret; 882 883 } 883 884 884 - SetPageDirty(page); 885 - SetPageUptodate(page); 885 + folio_mark_dirty(folio); 886 + folio_mark_uptodate(folio); 886 887 ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 887 888 *fsdata = (void 
*)CONVERT_INLINE_DATA; 888 889 889 890 out: 890 891 up_read(&EXT4_I(inode)->xattr_sem); 891 - if (page) { 892 - unlock_page(page); 893 - put_page(page); 892 + if (folio) { 893 + folio_unlock(folio); 894 + folio_put(folio); 894 895 } 895 896 return ret; 896 897 } ··· 911 912 { 912 913 int ret; 913 914 handle_t *handle; 914 - struct page *page; 915 + struct folio *folio; 915 916 struct ext4_iloc iloc; 916 917 int retries = 0; 917 - unsigned int flags; 918 918 919 919 ret = ext4_get_inode_loc(inode, &iloc); 920 920 if (ret) ··· 945 947 * We cannot recurse into the filesystem as the transaction 946 948 * is already started. 947 949 */ 948 - flags = memalloc_nofs_save(); 949 - page = grab_cache_page_write_begin(mapping, 0); 950 - memalloc_nofs_restore(flags); 951 - if (!page) { 950 + folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, 951 + mapping_gfp_mask(mapping)); 952 + if (!folio) { 952 953 ret = -ENOMEM; 953 954 goto out_journal; 954 955 } ··· 958 961 goto out_release_page; 959 962 } 960 963 961 - if (!PageUptodate(page)) { 962 - ret = ext4_read_inline_page(inode, page); 964 + if (!folio_test_uptodate(folio)) { 965 + ret = ext4_read_inline_folio(inode, folio); 963 966 if (ret < 0) 964 967 goto out_release_page; 965 968 } ··· 969 972 goto out_release_page; 970 973 971 974 up_read(&EXT4_I(inode)->xattr_sem); 972 - *pagep = page; 975 + *pagep = &folio->page; 973 976 brelse(iloc.bh); 974 977 return 1; 975 978 out_release_page: 976 979 up_read(&EXT4_I(inode)->xattr_sem); 977 - unlock_page(page); 978 - put_page(page); 980 + folio_unlock(folio); 981 + folio_put(folio); 979 982 out_journal: 980 983 ext4_journal_stop(handle); 981 984 out:
+302 -508
fs/ext4/inode.c
··· 136 136 new_size); 137 137 } 138 138 139 - static int __ext4_journalled_writepage(struct page *page, unsigned int len); 140 139 static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, 141 140 int pextents); 142 141 ··· 179 180 if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) 180 181 ext4_evict_ea_inode(inode); 181 182 if (inode->i_nlink) { 182 - /* 183 - * When journalling data dirty buffers are tracked only in the 184 - * journal. So although mm thinks everything is clean and 185 - * ready for reaping the inode might still have some pages to 186 - * write in the running transaction or waiting to be 187 - * checkpointed. Thus calling jbd2_journal_invalidate_folio() 188 - * (via truncate_inode_pages()) to discard these buffers can 189 - * cause data loss. Also even if we did not discard these 190 - * buffers, we would have no way to find them after the inode 191 - * is reaped and thus user could see stale data if he tries to 192 - * read them before the transaction is checkpointed. So be 193 - * careful and force everything to disk here... We use 194 - * ei->i_datasync_tid to store the newest transaction 195 - * containing inode's data. 196 - * 197 - * Note that directories do not have this problem because they 198 - * don't use page cache. 199 - */ 200 - if (inode->i_ino != EXT4_JOURNAL_INO && 201 - ext4_should_journal_data(inode) && 202 - S_ISREG(inode->i_mode) && inode->i_data.nrpages) { 203 - journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 204 - tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; 205 - 206 - jbd2_complete_transaction(journal, commit_tid); 207 - filemap_write_and_wait(&inode->i_data); 208 - } 209 183 truncate_inode_pages_final(&inode->i_data); 210 184 211 185 goto no_delete; ··· 977 1005 } 978 1006 979 1007 /* 980 - * To preserve ordering, it is essential that the hole instantiation and 981 - * the data write be encapsulated in a single transaction. 
We cannot 982 - * close off a transaction and start a new one between the ext4_get_block() 983 - * and the commit_write(). So doing the jbd2_journal_start at the start of 984 - * prepare_write() is the right place. 985 - * 986 - * Also, this function can nest inside ext4_writepage(). In that case, we 987 - * *know* that ext4_writepage() has generated enough buffer credits to do the 988 - * whole page. So we won't block on the journal in that case, which is good, 989 - * because the caller may be PF_MEMALLOC. 990 - * 991 - * By accident, ext4 can be reentered when a transaction is open via 992 - * quota file writes. If we were to commit the transaction while thus 993 - * reentered, there can be a deadlock - we would be holding a quota 994 - * lock, and the commit would never complete if another thread had a 995 - * transaction open and was blocking on the quota lock - a ranking 996 - * violation. 997 - * 998 - * So what we do is to rely on the fact that jbd2_journal_stop/journal_start 999 - * will _not_ run commit under these circumstances because handle->h_ref 1000 - * is elevated. We'll still have enough credits for the tiny quotafile 1001 - * write. 1008 + * Helper for handling dirtying of journalled data. We also mark the folio as 1009 + * dirty so that writeback code knows about this page (and inode) contains 1010 + * dirty data. ext4_writepages() then commits appropriate transaction to 1011 + * make data stable. 
1002 1012 */ 1013 + static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) 1014 + { 1015 + folio_mark_dirty(bh->b_folio); 1016 + return ext4_handle_dirty_metadata(handle, NULL, bh); 1017 + } 1018 + 1003 1019 int do_journal_get_write_access(handle_t *handle, struct inode *inode, 1004 1020 struct buffer_head *bh) 1005 1021 { ··· 1010 1050 ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, 1011 1051 EXT4_JTR_NONE); 1012 1052 if (!ret && dirty) 1013 - ret = ext4_handle_dirty_metadata(handle, NULL, bh); 1053 + ret = ext4_dirty_journalled_data(handle, bh); 1014 1054 return ret; 1015 1055 } 1016 1056 1017 1057 #ifdef CONFIG_FS_ENCRYPTION 1018 - static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, 1058 + static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, 1019 1059 get_block_t *get_block) 1020 1060 { 1021 1061 unsigned from = pos & (PAGE_SIZE - 1); 1022 1062 unsigned to = from + len; 1023 - struct inode *inode = page->mapping->host; 1063 + struct inode *inode = folio->mapping->host; 1024 1064 unsigned block_start, block_end; 1025 1065 sector_t block; 1026 1066 int err = 0; ··· 1030 1070 int nr_wait = 0; 1031 1071 int i; 1032 1072 1033 - BUG_ON(!PageLocked(page)); 1073 + BUG_ON(!folio_test_locked(folio)); 1034 1074 BUG_ON(from > PAGE_SIZE); 1035 1075 BUG_ON(to > PAGE_SIZE); 1036 1076 BUG_ON(from > to); 1037 1077 1038 - if (!page_has_buffers(page)) 1039 - create_empty_buffers(page, blocksize, 0); 1040 - head = page_buffers(page); 1078 + head = folio_buffers(folio); 1079 + if (!head) { 1080 + create_empty_buffers(&folio->page, blocksize, 0); 1081 + head = folio_buffers(folio); 1082 + } 1041 1083 bbits = ilog2(blocksize); 1042 - block = (sector_t)page->index << (PAGE_SHIFT - bbits); 1084 + block = (sector_t)folio->index << (PAGE_SHIFT - bbits); 1043 1085 1044 1086 for (bh = head, block_start = 0; bh != head || !block_start; 1045 1087 block++, block_start = block_end, bh = bh->b_this_page) 
{ 1046 1088 block_end = block_start + blocksize; 1047 1089 if (block_end <= from || block_start >= to) { 1048 - if (PageUptodate(page)) { 1090 + if (folio_test_uptodate(folio)) { 1049 1091 set_buffer_uptodate(bh); 1050 1092 } 1051 1093 continue; ··· 1060 1098 if (err) 1061 1099 break; 1062 1100 if (buffer_new(bh)) { 1063 - if (PageUptodate(page)) { 1101 + if (folio_test_uptodate(folio)) { 1064 1102 clear_buffer_new(bh); 1065 1103 set_buffer_uptodate(bh); 1066 1104 mark_buffer_dirty(bh); 1067 1105 continue; 1068 1106 } 1069 1107 if (block_end > to || block_start < from) 1070 - zero_user_segments(page, to, block_end, 1071 - block_start, from); 1108 + folio_zero_segments(folio, to, 1109 + block_end, 1110 + block_start, from); 1072 1111 continue; 1073 1112 } 1074 1113 } 1075 - if (PageUptodate(page)) { 1114 + if (folio_test_uptodate(folio)) { 1076 1115 set_buffer_uptodate(bh); 1077 1116 continue; 1078 1117 } ··· 1093 1130 err = -EIO; 1094 1131 } 1095 1132 if (unlikely(err)) { 1096 - page_zero_new_buffers(page, from, to); 1133 + page_zero_new_buffers(&folio->page, from, to); 1097 1134 } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { 1098 1135 for (i = 0; i < nr_wait; i++) { 1099 1136 int err2; 1100 1137 1101 - err2 = fscrypt_decrypt_pagecache_blocks(page_folio(page), 1102 - blocksize, 1103 - bh_offset(wait[i])); 1138 + err2 = fscrypt_decrypt_pagecache_blocks(folio, 1139 + blocksize, bh_offset(wait[i])); 1104 1140 if (err2) { 1105 1141 clear_buffer_uptodate(wait[i]); 1106 1142 err = err2; ··· 1111 1149 } 1112 1150 #endif 1113 1151 1152 + /* 1153 + * To preserve ordering, it is essential that the hole instantiation and 1154 + * the data write be encapsulated in a single transaction. We cannot 1155 + * close off a transaction and start a new one between the ext4_get_block() 1156 + * and the ext4_write_end(). So doing the jbd2_journal_start at the start of 1157 + * ext4_write_begin() is the right place. 
1158 + */ 1114 1159 static int ext4_write_begin(struct file *file, struct address_space *mapping, 1115 1160 loff_t pos, unsigned len, 1116 1161 struct page **pagep, void **fsdata) ··· 1126 1157 int ret, needed_blocks; 1127 1158 handle_t *handle; 1128 1159 int retries = 0; 1129 - struct page *page; 1160 + struct folio *folio; 1130 1161 pgoff_t index; 1131 1162 unsigned from, to; 1132 1163 ··· 1153 1184 } 1154 1185 1155 1186 /* 1156 - * grab_cache_page_write_begin() can take a long time if the 1157 - * system is thrashing due to memory pressure, or if the page 1187 + * __filemap_get_folio() can take a long time if the 1188 + * system is thrashing due to memory pressure, or if the folio 1158 1189 * is being written back. So grab it first before we start 1159 1190 * the transaction handle. This also allows us to allocate 1160 - * the page (if needed) without using GFP_NOFS. 1191 + * the folio (if needed) without using GFP_NOFS. 1161 1192 */ 1162 1193 retry_grab: 1163 - page = grab_cache_page_write_begin(mapping, index); 1164 - if (!page) 1194 + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, 1195 + mapping_gfp_mask(mapping)); 1196 + if (!folio) 1165 1197 return -ENOMEM; 1166 1198 /* 1167 1199 * The same as page allocation, we prealloc buffer heads before 1168 1200 * starting the handle. 
1169 1201 */ 1170 - if (!page_has_buffers(page)) 1171 - create_empty_buffers(page, inode->i_sb->s_blocksize, 0); 1202 + if (!folio_buffers(folio)) 1203 + create_empty_buffers(&folio->page, inode->i_sb->s_blocksize, 0); 1172 1204 1173 - unlock_page(page); 1205 + folio_unlock(folio); 1174 1206 1175 1207 retry_journal: 1176 1208 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); 1177 1209 if (IS_ERR(handle)) { 1178 - put_page(page); 1210 + folio_put(folio); 1179 1211 return PTR_ERR(handle); 1180 1212 } 1181 1213 1182 - lock_page(page); 1183 - if (page->mapping != mapping) { 1184 - /* The page got truncated from under us */ 1185 - unlock_page(page); 1186 - put_page(page); 1214 + folio_lock(folio); 1215 + if (folio->mapping != mapping) { 1216 + /* The folio got truncated from under us */ 1217 + folio_unlock(folio); 1218 + folio_put(folio); 1187 1219 ext4_journal_stop(handle); 1188 1220 goto retry_grab; 1189 1221 } 1190 - /* In case writeback began while the page was unlocked */ 1191 - wait_for_stable_page(page); 1222 + /* In case writeback began while the folio was unlocked */ 1223 + folio_wait_stable(folio); 1192 1224 1193 1225 #ifdef CONFIG_FS_ENCRYPTION 1194 1226 if (ext4_should_dioread_nolock(inode)) 1195 - ret = ext4_block_write_begin(page, pos, len, 1227 + ret = ext4_block_write_begin(folio, pos, len, 1196 1228 ext4_get_block_unwritten); 1197 1229 else 1198 - ret = ext4_block_write_begin(page, pos, len, 1199 - ext4_get_block); 1230 + ret = ext4_block_write_begin(folio, pos, len, ext4_get_block); 1200 1231 #else 1201 1232 if (ext4_should_dioread_nolock(inode)) 1202 - ret = __block_write_begin(page, pos, len, 1233 + ret = __block_write_begin(&folio->page, pos, len, 1203 1234 ext4_get_block_unwritten); 1204 1235 else 1205 - ret = __block_write_begin(page, pos, len, ext4_get_block); 1236 + ret = __block_write_begin(&folio->page, pos, len, ext4_get_block); 1206 1237 #endif 1207 1238 if (!ret && ext4_should_journal_data(inode)) { 1208 1239 ret = 
ext4_walk_page_buffers(handle, inode, 1209 - page_buffers(page), from, to, NULL, 1210 - do_journal_get_write_access); 1240 + folio_buffers(folio), from, to, 1241 + NULL, do_journal_get_write_access); 1211 1242 } 1212 1243 1213 1244 if (ret) { 1214 1245 bool extended = (pos + len > inode->i_size) && 1215 1246 !ext4_verity_in_progress(inode); 1216 1247 1217 - unlock_page(page); 1248 + folio_unlock(folio); 1218 1249 /* 1219 1250 * __block_write_begin may have instantiated a few blocks 1220 1251 * outside i_size. Trim these off again. Don't need ··· 1242 1273 if (ret == -ENOSPC && 1243 1274 ext4_should_retry_alloc(inode->i_sb, &retries)) 1244 1275 goto retry_journal; 1245 - put_page(page); 1276 + folio_put(folio); 1246 1277 return ret; 1247 1278 } 1248 - *pagep = page; 1279 + *pagep = &folio->page; 1249 1280 return ret; 1250 1281 } 1251 1282 ··· 1257 1288 if (!buffer_mapped(bh) || buffer_freed(bh)) 1258 1289 return 0; 1259 1290 set_buffer_uptodate(bh); 1260 - ret = ext4_handle_dirty_metadata(handle, NULL, bh); 1291 + ret = ext4_dirty_journalled_data(handle, bh); 1261 1292 clear_buffer_meta(bh); 1262 1293 clear_buffer_prio(bh); 1263 1294 return ret; ··· 1275 1306 loff_t pos, unsigned len, unsigned copied, 1276 1307 struct page *page, void *fsdata) 1277 1308 { 1309 + struct folio *folio = page_folio(page); 1278 1310 handle_t *handle = ext4_journal_current_handle(); 1279 1311 struct inode *inode = mapping->host; 1280 1312 loff_t old_size = inode->i_size; ··· 1291 1321 1292 1322 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 1293 1323 /* 1294 - * it's important to update i_size while still holding page lock: 1324 + * it's important to update i_size while still holding folio lock: 1295 1325 * page writeout could otherwise come in and zero beyond i_size. 
1296 1326 * 1297 1327 * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree ··· 1299 1329 */ 1300 1330 if (!verity) 1301 1331 i_size_changed = ext4_update_inode_size(inode, pos + copied); 1302 - unlock_page(page); 1303 - put_page(page); 1332 + folio_unlock(folio); 1333 + folio_put(folio); 1304 1334 1305 1335 if (old_size < pos && !verity) 1306 1336 pagecache_isize_extended(inode, old_size, pos); 1307 1337 /* 1308 - * Don't mark the inode dirty under page lock. First, it unnecessarily 1309 - * makes the holding time of page lock longer. Second, it forces lock 1310 - * ordering of page lock and transaction start for journaling 1338 + * Don't mark the inode dirty under folio lock. First, it unnecessarily 1339 + * makes the holding time of folio lock longer. Second, it forces lock 1340 + * ordering of folio lock and transaction start for journaling 1311 1341 * filesystems. 1312 1342 */ 1313 1343 if (i_size_changed) ··· 1341 1371 /* 1342 1372 * This is a private version of page_zero_new_buffers() which doesn't 1343 1373 * set the buffer to be dirty, since in data=journalled mode we need 1344 - * to call ext4_handle_dirty_metadata() instead. 1374 + * to call ext4_dirty_journalled_data() instead. 
1345 1375 */ 1346 1376 static void ext4_journalled_zero_new_buffers(handle_t *handle, 1347 1377 struct inode *inode, 1348 - struct page *page, 1378 + struct folio *folio, 1349 1379 unsigned from, unsigned to) 1350 1380 { 1351 1381 unsigned int block_start = 0, block_end; 1352 1382 struct buffer_head *head, *bh; 1353 1383 1354 - bh = head = page_buffers(page); 1384 + bh = head = folio_buffers(folio); 1355 1385 do { 1356 1386 block_end = block_start + bh->b_size; 1357 1387 if (buffer_new(bh)) { 1358 1388 if (block_end > from && block_start < to) { 1359 - if (!PageUptodate(page)) { 1389 + if (!folio_test_uptodate(folio)) { 1360 1390 unsigned start, size; 1361 1391 1362 1392 start = max(from, block_start); 1363 1393 size = min(to, block_end) - start; 1364 1394 1365 - zero_user(page, start, size); 1395 + folio_zero_range(folio, start, size); 1366 1396 write_end_fn(handle, inode, bh); 1367 1397 } 1368 1398 clear_buffer_new(bh); ··· 1378 1408 loff_t pos, unsigned len, unsigned copied, 1379 1409 struct page *page, void *fsdata) 1380 1410 { 1411 + struct folio *folio = page_folio(page); 1381 1412 handle_t *handle = ext4_journal_current_handle(); 1382 1413 struct inode *inode = mapping->host; 1383 1414 loff_t old_size = inode->i_size; ··· 1397 1426 if (ext4_has_inline_data(inode)) 1398 1427 return ext4_write_inline_data_end(inode, pos, len, copied, page); 1399 1428 1400 - if (unlikely(copied < len) && !PageUptodate(page)) { 1429 + if (unlikely(copied < len) && !folio_test_uptodate(folio)) { 1401 1430 copied = 0; 1402 - ext4_journalled_zero_new_buffers(handle, inode, page, from, to); 1431 + ext4_journalled_zero_new_buffers(handle, inode, folio, 1432 + from, to); 1403 1433 } else { 1404 1434 if (unlikely(copied < len)) 1405 - ext4_journalled_zero_new_buffers(handle, inode, page, 1435 + ext4_journalled_zero_new_buffers(handle, inode, folio, 1406 1436 from + copied, to); 1407 - ret = ext4_walk_page_buffers(handle, inode, page_buffers(page), 1437 + ret = 
ext4_walk_page_buffers(handle, inode, 1438 + folio_buffers(folio), 1408 1439 from, from + copied, &partial, 1409 1440 write_end_fn); 1410 1441 if (!partial) 1411 - SetPageUptodate(page); 1442 + folio_mark_uptodate(folio); 1412 1443 } 1413 1444 if (!verity) 1414 1445 size_changed = ext4_update_inode_size(inode, pos + copied); 1415 - ext4_set_inode_state(inode, EXT4_STATE_JDATA); 1416 1446 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; 1417 - unlock_page(page); 1418 - put_page(page); 1447 + folio_unlock(folio); 1448 + folio_put(folio); 1419 1449 1420 1450 if (old_size < pos && !verity) 1421 1451 pagecache_isize_extended(inode, old_size, pos); ··· 1540 1568 struct ext4_io_submit io_submit; /* IO submission data */ 1541 1569 unsigned int do_map:1; 1542 1570 unsigned int scanned_until_end:1; 1571 + unsigned int journalled_more_data:1; 1543 1572 }; 1544 1573 1545 1574 static void mpage_release_unused_pages(struct mpage_da_data *mpd, ··· 1620 1647 ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", 1621 1648 ei->i_reserved_data_blocks); 1622 1649 return; 1623 - } 1624 - 1625 - static int ext4_bh_delay_or_unwritten(handle_t *handle, struct inode *inode, 1626 - struct buffer_head *bh) 1627 - { 1628 - return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); 1629 1650 } 1630 1651 1631 1652 /* ··· 1854 1887 return 0; 1855 1888 } 1856 1889 1857 - static int __ext4_journalled_writepage(struct page *page, 1858 - unsigned int len) 1890 + static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio) 1859 1891 { 1860 - struct address_space *mapping = page->mapping; 1861 - struct inode *inode = mapping->host; 1862 - handle_t *handle = NULL; 1863 - int ret = 0, err = 0; 1864 - int inline_data = ext4_has_inline_data(inode); 1865 - struct buffer_head *inode_bh = NULL; 1866 - loff_t size; 1867 - 1868 - ClearPageChecked(page); 1869 - 1870 - if (inline_data) { 1871 - BUG_ON(page->index != 0); 1872 - BUG_ON(len > ext4_get_max_inline_size(inode)); 
1873 - inode_bh = ext4_journalled_write_inline_data(inode, len, page); 1874 - if (inode_bh == NULL) 1875 - goto out; 1876 - } 1877 - /* 1878 - * We need to release the page lock before we start the 1879 - * journal, so grab a reference so the page won't disappear 1880 - * out from under us. 1881 - */ 1882 - get_page(page); 1883 - unlock_page(page); 1884 - 1885 - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1886 - ext4_writepage_trans_blocks(inode)); 1887 - if (IS_ERR(handle)) { 1888 - ret = PTR_ERR(handle); 1889 - put_page(page); 1890 - goto out_no_pagelock; 1891 - } 1892 - BUG_ON(!ext4_handle_valid(handle)); 1893 - 1894 - lock_page(page); 1895 - put_page(page); 1896 - size = i_size_read(inode); 1897 - if (page->mapping != mapping || page_offset(page) > size) { 1898 - /* The page got truncated from under us */ 1899 - ext4_journal_stop(handle); 1900 - ret = 0; 1901 - goto out; 1902 - } 1903 - 1904 - if (inline_data) { 1905 - ret = ext4_mark_inode_dirty(handle, inode); 1906 - } else { 1907 - struct buffer_head *page_bufs = page_buffers(page); 1908 - 1909 - if (page->index == size >> PAGE_SHIFT) 1910 - len = size & ~PAGE_MASK; 1911 - else 1912 - len = PAGE_SIZE; 1913 - 1914 - ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, 1915 - NULL, do_journal_get_write_access); 1916 - 1917 - err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, 1918 - NULL, write_end_fn); 1919 - } 1920 - if (ret == 0) 1921 - ret = err; 1922 - err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); 1923 - if (ret == 0) 1924 - ret = err; 1925 - EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; 1926 - err = ext4_journal_stop(handle); 1927 - if (!ret) 1928 - ret = err; 1929 - 1930 - ext4_set_inode_state(inode, EXT4_STATE_JDATA); 1931 - out: 1932 - unlock_page(page); 1933 - out_no_pagelock: 1934 - brelse(inode_bh); 1935 - return ret; 1892 + mpd->first_page += folio_nr_pages(folio); 1893 + folio_unlock(folio); 1936 1894 } 1937 1895 1938 - /* 
1939 - * Note that we don't need to start a transaction unless we're journaling data 1940 - * because we should have holes filled from ext4_page_mkwrite(). We even don't 1941 - * need to file the inode to the transaction's list in ordered mode because if 1942 - * we are writing back data added by write(), the inode is already there and if 1943 - * we are writing back data modified via mmap(), no one guarantees in which 1944 - * transaction the data will hit the disk. In case we are journaling data, we 1945 - * cannot start transaction directly because transaction start ranks above page 1946 - * lock so we have to do some magic. 1947 - * 1948 - * This function can get called via... 1949 - * - ext4_writepages after taking page lock (have journal handle) 1950 - * - journal_submit_inode_data_buffers (no journal handle) 1951 - * - shrink_page_list via the kswapd/direct reclaim (no journal handle) 1952 - * - grab_page_cache when doing write_begin (have journal handle) 1953 - * 1954 - * We don't do any block allocation in this function. If we have page with 1955 - * multiple blocks we need to write those buffer_heads that are mapped. This 1956 - * is important for mmaped based write. So if we do with blocksize 1K 1957 - * truncate(f, 1024); 1958 - * a = mmap(f, 0, 4096); 1959 - * a[0] = 'a'; 1960 - * truncate(f, 4096); 1961 - * we have in the page first buffer_head mapped via page_mkwrite call back 1962 - * but other buffer_heads would be unmapped but dirty (dirty done via the 1963 - * do_wp_page). So writepage should write the first block. If we modify 1964 - * the mmap area beyond 1024 we will again get a page_fault and the 1965 - * page_mkwrite callback will do the block allocation and mark the 1966 - * buffer_heads mapped. 1967 - * 1968 - * We redirty the page if we have any buffer_heads that is either delay or 1969 - * unwritten in the page. 1970 - * 1971 - * We can get recursively called as show below. 
1972 - * 1973 - * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> 1974 - * ext4_writepage() 1975 - * 1976 - * But since we don't do any block allocation we should not deadlock. 1977 - * Page also have the dirty flag cleared so we don't get recurive page_lock. 1978 - */ 1979 - static int ext4_writepage(struct page *page, 1980 - struct writeback_control *wbc) 1896 + static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) 1981 1897 { 1982 - struct folio *folio = page_folio(page); 1983 - int ret = 0; 1984 - loff_t size; 1985 - unsigned int len; 1986 - struct buffer_head *page_bufs = NULL; 1987 - struct inode *inode = page->mapping->host; 1988 - struct ext4_io_submit io_submit; 1989 - 1990 - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { 1991 - folio_invalidate(folio, 0, folio_size(folio)); 1992 - folio_unlock(folio); 1993 - return -EIO; 1994 - } 1995 - 1996 - trace_ext4_writepage(page); 1997 - size = i_size_read(inode); 1998 - if (page->index == size >> PAGE_SHIFT && 1999 - !ext4_verity_in_progress(inode)) 2000 - len = size & ~PAGE_MASK; 2001 - else 2002 - len = PAGE_SIZE; 2003 - 2004 - /* Should never happen but for bugs in other kernel subsystems */ 2005 - if (!page_has_buffers(page)) { 2006 - ext4_warning_inode(inode, 2007 - "page %lu does not have buffers attached", page->index); 2008 - ClearPageDirty(page); 2009 - unlock_page(page); 2010 - return 0; 2011 - } 2012 - 2013 - page_bufs = page_buffers(page); 2014 - /* 2015 - * We cannot do block allocation or other extent handling in this 2016 - * function. If there are buffers needing that, we have to redirty 2017 - * the page. But we may reach here when we do a journal commit via 2018 - * journal_submit_inode_data_buffers() and in that case we must write 2019 - * allocated buffers to achieve data=ordered mode guarantees. 
2020 - * 2021 - * Also, if there is only one buffer per page (the fs block 2022 - * size == the page size), if one buffer needs block 2023 - * allocation or needs to modify the extent tree to clear the 2024 - * unwritten flag, we know that the page can't be written at 2025 - * all, so we might as well refuse the write immediately. 2026 - * Unfortunately if the block size != page size, we can't as 2027 - * easily detect this case using ext4_walk_page_buffers(), but 2028 - * for the extremely common case, this is an optimization that 2029 - * skips a useless round trip through ext4_bio_write_page(). 2030 - */ 2031 - if (ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, NULL, 2032 - ext4_bh_delay_or_unwritten)) { 2033 - redirty_page_for_writepage(wbc, page); 2034 - if ((current->flags & PF_MEMALLOC) || 2035 - (inode->i_sb->s_blocksize == PAGE_SIZE)) { 2036 - /* 2037 - * For memory cleaning there's no point in writing only 2038 - * some buffers. So just bail out. Warn if we came here 2039 - * from direct reclaim. 2040 - */ 2041 - WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) 2042 - == PF_MEMALLOC); 2043 - unlock_page(page); 2044 - return 0; 2045 - } 2046 - } 2047 - 2048 - if (PageChecked(page) && ext4_should_journal_data(inode)) 2049 - /* 2050 - * It's mmapped pagecache. Add buffers and journal it. There 2051 - * doesn't seem much point in redirtying the page here. 
2052 - */ 2053 - return __ext4_journalled_writepage(page, len); 2054 - 2055 - ext4_io_submit_init(&io_submit, wbc); 2056 - io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); 2057 - if (!io_submit.io_end) { 2058 - redirty_page_for_writepage(wbc, page); 2059 - unlock_page(page); 2060 - return -ENOMEM; 2061 - } 2062 - ret = ext4_bio_write_page(&io_submit, page, len); 2063 - ext4_io_submit(&io_submit); 2064 - /* Drop io_end reference we got from init */ 2065 - ext4_put_io_end_defer(io_submit.io_end); 2066 - return ret; 2067 - } 2068 - 2069 - static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) 2070 - { 2071 - int len; 1898 + size_t len; 2072 1899 loff_t size; 2073 1900 int err; 2074 1901 2075 - BUG_ON(page->index != mpd->first_page); 2076 - clear_page_dirty_for_io(page); 1902 + BUG_ON(folio->index != mpd->first_page); 1903 + folio_clear_dirty_for_io(folio); 2077 1904 /* 2078 1905 * We have to be very careful here! Nothing protects writeback path 2079 1906 * against i_size changes and the page can be writeably mapped into 2080 1907 * page tables. So an application can be growing i_size and writing 2081 - * data through mmap while writeback runs. clear_page_dirty_for_io() 1908 + * data through mmap while writeback runs. folio_clear_dirty_for_io() 2082 1909 * write-protects our page in page tables and the page cannot get 2083 - * written to again until we release page lock. So only after 2084 - * clear_page_dirty_for_io() we are safe to sample i_size for 2085 - * ext4_bio_write_page() to zero-out tail of the written page. We rely 2086 - * on the barrier provided by TestClearPageDirty in 2087 - * clear_page_dirty_for_io() to make sure i_size is really sampled only 1910 + * written to again until we release folio lock. So only after 1911 + * folio_clear_dirty_for_io() we are safe to sample i_size for 1912 + * ext4_bio_write_folio() to zero-out tail of the written page. 
We rely 1913 + * on the barrier provided by folio_test_clear_dirty() in 1914 + * folio_clear_dirty_for_io() to make sure i_size is really sampled only 2088 1915 * after page tables are updated. 2089 1916 */ 2090 1917 size = i_size_read(mpd->inode); 2091 - if (page->index == size >> PAGE_SHIFT && 1918 + len = folio_size(folio); 1919 + if (folio_pos(folio) + len > size && 2092 1920 !ext4_verity_in_progress(mpd->inode)) 2093 1921 len = size & ~PAGE_MASK; 2094 - else 2095 - len = PAGE_SIZE; 2096 - err = ext4_bio_write_page(&mpd->io_submit, page, len); 1922 + err = ext4_bio_write_folio(&mpd->io_submit, folio, len); 2097 1923 if (!err) 2098 1924 mpd->wbc->nr_to_write--; 2099 - mpd->first_page++; 2100 1925 2101 1926 return err; 2102 1927 } ··· 1999 2240 } while (lblk++, (bh = bh->b_this_page) != head); 2000 2241 /* So far everything mapped? Submit the page for IO. */ 2001 2242 if (mpd->map.m_len == 0) { 2002 - err = mpage_submit_page(mpd, head->b_page); 2243 + err = mpage_submit_folio(mpd, head->b_folio); 2003 2244 if (err < 0) 2004 2245 return err; 2246 + mpage_folio_done(mpd, head->b_folio); 2005 2247 } 2006 2248 if (lblk >= blocks) { 2007 2249 mpd->scanned_until_end = 1; ··· 2012 2252 } 2013 2253 2014 2254 /* 2015 - * mpage_process_page - update page buffers corresponding to changed extent and 2016 - * may submit fully mapped page for IO 2017 - * 2018 - * @mpd - description of extent to map, on return next extent to map 2019 - * @m_lblk - logical block mapping. 2020 - * @m_pblk - corresponding physical mapping. 2021 - * @map_bh - determines on return whether this page requires any further 2255 + * mpage_process_folio - update folio buffers corresponding to changed extent 2256 + * and may submit fully mapped page for IO 2257 + * @mpd: description of extent to map, on return next extent to map 2258 + * @folio: Contains these buffers. 2259 + * @m_lblk: logical block mapping. 2260 + * @m_pblk: corresponding physical mapping. 
2261 + * @map_bh: determines on return whether this page requires any further 2022 2262 * mapping or not. 2023 - * Scan given page buffers corresponding to changed extent and update buffer 2263 + * 2264 + * Scan given folio buffers corresponding to changed extent and update buffer 2024 2265 * state according to new extent state. 2025 2266 * We map delalloc buffers to their physical location, clear unwritten bits. 2026 - * If the given page is not fully mapped, we update @map to the next extent in 2027 - * the given page that needs mapping & return @map_bh as true. 2267 + * If the given folio is not fully mapped, we update @mpd to the next extent in 2268 + * the given folio that needs mapping & return @map_bh as true. 2028 2269 */ 2029 - static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, 2270 + static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, 2030 2271 ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk, 2031 2272 bool *map_bh) 2032 2273 { ··· 2040 2279 ssize_t io_end_size = 0; 2041 2280 struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); 2042 2281 2043 - bh = head = page_buffers(page); 2282 + bh = head = folio_buffers(folio); 2044 2283 do { 2045 2284 if (lblk < mpd->map.m_lblk) 2046 2285 continue; 2047 2286 if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { 2048 2287 /* 2049 2288 * Buffer after end of mapped extent. 2050 - * Find next buffer in the page to map. 2289 + * Find next buffer in the folio to map. 
2051 2290 */ 2052 2291 mpd->map.m_len = 0; 2053 2292 mpd->map.m_flags = 0; ··· 2120 2359 if (nr == 0) 2121 2360 break; 2122 2361 for (i = 0; i < nr; i++) { 2123 - struct page *page = &fbatch.folios[i]->page; 2362 + struct folio *folio = fbatch.folios[i]; 2124 2363 2125 - err = mpage_process_page(mpd, page, &lblk, &pblock, 2364 + err = mpage_process_folio(mpd, folio, &lblk, &pblock, 2126 2365 &map_bh); 2127 2366 /* 2128 2367 * If map_bh is true, means page may require further bh ··· 2132 2371 if (err < 0 || map_bh) 2133 2372 goto out; 2134 2373 /* Page fully mapped - let IO run! */ 2135 - err = mpage_submit_page(mpd, page); 2374 + err = mpage_submit_folio(mpd, folio); 2136 2375 if (err < 0) 2137 2376 goto out; 2377 + mpage_folio_done(mpd, folio); 2138 2378 } 2139 2379 folio_batch_release(&fbatch); 2140 2380 } ··· 2321 2559 MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); 2322 2560 } 2323 2561 2324 - /* Return true if the page needs to be written as part of transaction commit */ 2325 - static bool ext4_page_nomap_can_writeout(struct page *page) 2562 + static int ext4_journal_page_buffers(handle_t *handle, struct page *page, 2563 + int len) 2326 2564 { 2327 - struct buffer_head *bh, *head; 2565 + struct buffer_head *page_bufs = page_buffers(page); 2566 + struct inode *inode = page->mapping->host; 2567 + int ret, err; 2328 2568 2329 - bh = head = page_buffers(page); 2330 - do { 2331 - if (buffer_dirty(bh) && buffer_mapped(bh) && !buffer_delay(bh)) 2332 - return true; 2333 - } while ((bh = bh->b_this_page) != head); 2334 - return false; 2569 + ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, 2570 + NULL, do_journal_get_write_access); 2571 + err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, 2572 + NULL, write_end_fn); 2573 + if (ret == 0) 2574 + ret = err; 2575 + err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); 2576 + if (ret == 0) 2577 + ret = err; 2578 + EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; 2579 + 
2580 + return ret; 2581 + } 2582 + 2583 + static int mpage_journal_page_buffers(handle_t *handle, 2584 + struct mpage_da_data *mpd, 2585 + struct page *page) 2586 + { 2587 + struct inode *inode = mpd->inode; 2588 + loff_t size = i_size_read(inode); 2589 + int len; 2590 + 2591 + ClearPageChecked(page); 2592 + mpd->wbc->nr_to_write--; 2593 + 2594 + if (page->index == size >> PAGE_SHIFT && 2595 + !ext4_verity_in_progress(inode)) 2596 + len = size & ~PAGE_MASK; 2597 + else 2598 + len = PAGE_SIZE; 2599 + 2600 + return ext4_journal_page_buffers(handle, page, len); 2335 2601 } 2336 2602 2337 2603 /* ··· 2387 2597 struct address_space *mapping = mpd->inode->i_mapping; 2388 2598 struct folio_batch fbatch; 2389 2599 unsigned int nr_folios; 2390 - long left = mpd->wbc->nr_to_write; 2391 2600 pgoff_t index = mpd->first_page; 2392 2601 pgoff_t end = mpd->last_page; 2393 2602 xa_mark_t tag; ··· 2394 2605 int blkbits = mpd->inode->i_blkbits; 2395 2606 ext4_lblk_t lblk; 2396 2607 struct buffer_head *head; 2608 + handle_t *handle = NULL; 2609 + int bpp = ext4_journal_blocks_per_page(mpd->inode); 2397 2610 2398 2611 if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) 2399 2612 tag = PAGECACHE_TAG_TOWRITE; 2400 2613 else 2401 2614 tag = PAGECACHE_TAG_DIRTY; 2402 - folio_batch_init(&fbatch); 2615 + 2403 2616 mpd->map.m_len = 0; 2404 2617 mpd->next_page = index; 2618 + if (ext4_should_journal_data(mpd->inode)) { 2619 + handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, 2620 + bpp); 2621 + if (IS_ERR(handle)) 2622 + return PTR_ERR(handle); 2623 + } 2624 + folio_batch_init(&fbatch); 2405 2625 while (index <= end) { 2406 2626 nr_folios = filemap_get_folios_tag(mapping, &index, end, 2407 2627 tag, &fbatch); ··· 2428 2630 * newly appeared dirty pages, but have not synced all 2429 2631 * of the old dirty pages. 
2430 2632 */ 2431 - if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0) 2633 + if (mpd->wbc->sync_mode == WB_SYNC_NONE && 2634 + mpd->wbc->nr_to_write <= 2635 + mpd->map.m_len >> (PAGE_SHIFT - blkbits)) 2432 2636 goto out; 2433 2637 2434 2638 /* If we can't merge this page, we are done. */ 2435 2639 if (mpd->map.m_len > 0 && mpd->next_page != folio->index) 2436 2640 goto out; 2641 + 2642 + if (handle) { 2643 + err = ext4_journal_ensure_credits(handle, bpp, 2644 + 0); 2645 + if (err < 0) 2646 + goto out; 2647 + } 2437 2648 2438 2649 folio_lock(folio); 2439 2650 /* ··· 2483 2676 mpd->first_page = folio->index; 2484 2677 mpd->next_page = folio->index + folio_nr_pages(folio); 2485 2678 /* 2486 - * Writeout for transaction commit where we cannot 2487 - * modify metadata is simple. Just submit the page. 2679 + * Writeout when we cannot modify metadata is simple. 2680 + * Just submit the page. For data=journal mode we 2681 + * first handle writeout of the page for checkpoint and 2682 + * only after that handle delayed page dirtying. This 2683 + * makes sure current data is checkpointed to the final 2684 + * location before possibly journalling it again which 2685 + * is desirable when the page is frequently dirtied 2686 + * through a pin. 2488 2687 */ 2489 2688 if (!mpd->can_map) { 2490 - if (ext4_page_nomap_can_writeout(&folio->page)) { 2491 - err = mpage_submit_page(mpd, &folio->page); 2689 + err = mpage_submit_folio(mpd, folio); 2690 + if (err < 0) 2691 + goto out; 2692 + /* Pending dirtying of journalled data? 
*/ 2693 + if (folio_test_checked(folio)) { 2694 + err = mpage_journal_page_buffers(handle, 2695 + mpd, &folio->page); 2492 2696 if (err < 0) 2493 2697 goto out; 2494 - } else { 2495 - folio_unlock(folio); 2496 - mpd->first_page += folio_nr_pages(folio); 2698 + mpd->journalled_more_data = 1; 2497 2699 } 2700 + mpage_folio_done(mpd, folio); 2498 2701 } else { 2499 2702 /* Add all dirty buffers to mpd */ 2500 2703 lblk = ((ext4_lblk_t)folio->index) << ··· 2516 2699 goto out; 2517 2700 err = 0; 2518 2701 } 2519 - left -= folio_nr_pages(folio); 2520 2702 } 2521 2703 folio_batch_release(&fbatch); 2522 2704 cond_resched(); 2523 2705 } 2524 2706 mpd->scanned_until_end = 1; 2707 + if (handle) 2708 + ext4_journal_stop(handle); 2525 2709 return 0; 2526 2710 out: 2527 2711 folio_batch_release(&fbatch); 2712 + if (handle) 2713 + ext4_journal_stop(handle); 2528 2714 return err; 2529 - } 2530 - 2531 - static int ext4_writepage_cb(struct folio *folio, struct writeback_control *wbc, 2532 - void *data) 2533 - { 2534 - return ext4_writepage(&folio->page, wbc); 2535 2715 } 2536 2716 2537 2717 static int ext4_do_writepages(struct mpage_da_data *mpd) ··· 2555 2741 */ 2556 2742 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 2557 2743 goto out_writepages; 2558 - 2559 - if (ext4_should_journal_data(inode)) { 2560 - blk_start_plug(&plug); 2561 - ret = write_cache_pages(mapping, wbc, ext4_writepage_cb, NULL); 2562 - blk_finish_plug(&plug); 2563 - goto out_writepages; 2564 - } 2565 2744 2566 2745 /* 2567 2746 * If the filesystem has aborted, it is read-only, so return ··· 2589 2782 ext4_destroy_inline_data(handle, inode); 2590 2783 ext4_journal_stop(handle); 2591 2784 } 2785 + 2786 + /* 2787 + * data=journal mode does not do delalloc so we just need to writeout / 2788 + * journal already mapped buffers. On the other hand we need to commit 2789 + * transaction to make data stable. 
We expect all the data to be 2790 + * already in the journal (the only exception are DMA pinned pages 2791 + * dirtied behind our back) so we commit transaction here and run the 2792 + * writeback loop to checkpoint them. The checkpointing is not actually 2793 + * necessary to make data persistent *but* quite a few places (extent 2794 + * shifting operations, fsverity, ...) depend on being able to drop 2795 + * pagecache pages after calling filemap_write_and_wait() and for that 2796 + * checkpointing needs to happen. 2797 + */ 2798 + if (ext4_should_journal_data(inode)) { 2799 + mpd->can_map = 0; 2800 + if (wbc->sync_mode == WB_SYNC_ALL) 2801 + ext4_fc_commit(sbi->s_journal, 2802 + EXT4_I(inode)->i_datasync_tid); 2803 + } 2804 + mpd->journalled_more_data = 0; 2592 2805 2593 2806 if (ext4_should_dioread_nolock(inode)) { 2594 2807 /* ··· 2789 2962 2790 2963 percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); 2791 2964 ret = ext4_do_writepages(&mpd); 2965 + /* 2966 + * For data=journal writeback we could have come across pages marked 2967 + * for delayed dirtying (PageChecked) which were just added to the 2968 + * running transaction. Try once more to get them to stable storage. 
2969 + */ 2970 + if (!ret && mpd.journalled_more_data) 2971 + ret = ext4_do_writepages(&mpd); 2792 2972 percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); 2793 2973 2794 2974 return ret; ··· 2877 3043 struct page **pagep, void **fsdata) 2878 3044 { 2879 3045 int ret, retries = 0; 2880 - struct page *page; 3046 + struct folio *folio; 2881 3047 pgoff_t index; 2882 3048 struct inode *inode = mapping->host; 2883 3049 ··· 2904 3070 } 2905 3071 2906 3072 retry: 2907 - page = grab_cache_page_write_begin(mapping, index); 2908 - if (!page) 3073 + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, 3074 + mapping_gfp_mask(mapping)); 3075 + if (!folio) 2909 3076 return -ENOMEM; 2910 3077 2911 - /* In case writeback began while the page was unlocked */ 2912 - wait_for_stable_page(page); 3078 + /* In case writeback began while the folio was unlocked */ 3079 + folio_wait_stable(folio); 2913 3080 2914 3081 #ifdef CONFIG_FS_ENCRYPTION 2915 - ret = ext4_block_write_begin(page, pos, len, 2916 - ext4_da_get_block_prep); 3082 + ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep); 2917 3083 #else 2918 - ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); 3084 + ret = __block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep); 2919 3085 #endif 2920 3086 if (ret < 0) { 2921 - unlock_page(page); 2922 - put_page(page); 3087 + folio_unlock(folio); 3088 + folio_put(folio); 2923 3089 /* 2924 3090 * block_write_begin may have instantiated a few blocks 2925 3091 * outside i_size. Trim these off again. Don't need ··· 2934 3100 return ret; 2935 3101 } 2936 3102 2937 - *pagep = page; 3103 + *pagep = &folio->page; 2938 3104 return ret; 2939 3105 } 2940 3106 ··· 2993 3159 * i_disksize since writeback will push i_disksize upto i_size 2994 3160 * eventually. 
If the end of the current write is > i_size and 2995 3161 * inside an allocated block (ext4_da_should_update_i_disksize() 2996 - * check), we need to update i_disksize here as neither 2997 - * ext4_writepage() nor certain ext4_writepages() paths not 2998 - * allocating blocks update i_disksize. 3162 + * check), we need to update i_disksize here as certain 3163 + * ext4_writepages() paths not allocating blocks update i_disksize. 2999 3164 * 3000 3165 * Note that we defer inode dirtying to generic_write_end() / 3001 3166 * ext4_da_write_inline_data_end(). ··· 3068 3235 static sector_t ext4_bmap(struct address_space *mapping, sector_t block) 3069 3236 { 3070 3237 struct inode *inode = mapping->host; 3071 - journal_t *journal; 3072 3238 sector_t ret = 0; 3073 - int err; 3074 3239 3075 3240 inode_lock_shared(inode); 3076 3241 /* ··· 3078 3247 goto out; 3079 3248 3080 3249 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && 3081 - test_opt(inode->i_sb, DELALLOC)) { 3250 + (test_opt(inode->i_sb, DELALLOC) || 3251 + ext4_should_journal_data(inode))) { 3082 3252 /* 3083 - * With delalloc we want to sync the file 3084 - * so that we can make sure we allocate 3085 - * blocks for file 3253 + * With delalloc or journalled data we want to sync the file so 3254 + * that we can make sure we allocate blocks for file and data 3255 + * is in place for the user to see it 3086 3256 */ 3087 3257 filemap_write_and_wait(mapping); 3088 - } 3089 - 3090 - if (EXT4_JOURNAL(inode) && 3091 - ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { 3092 - /* 3093 - * This is a REALLY heavyweight approach, but the use of 3094 - * bmap on dirty files is expected to be extremely rare: 3095 - * only if we run lilo or swapon on a freshly made file 3096 - * do we expect this to happen. 3097 - * 3098 - * (bmap requires CAP_SYS_RAWIO so this does not 3099 - * represent an unprivileged user DOS attack --- we'd be 3100 - * in trouble if mortal users could trigger this path at 3101 - * will.) 
3102 - * 3103 - * NB. EXT4_STATE_JDATA is not set on files other than 3104 - * regular files. If somebody wants to bmap a directory 3105 - * or symlink and gets confused because the buffer 3106 - * hasn't yet been flushed to disk, they deserve 3107 - * everything they get. 3108 - */ 3109 - 3110 - ext4_clear_inode_state(inode, EXT4_STATE_JDATA); 3111 - journal = EXT4_JOURNAL(inode); 3112 - jbd2_journal_lock_updates(journal); 3113 - err = jbd2_journal_flush(journal, 0); 3114 - jbd2_journal_unlock_updates(journal); 3115 - 3116 - if (err) 3117 - goto out; 3118 3258 } 3119 3259 3120 3260 ret = iomap_bmap(mapping, block, &ext4_iomap_ops); ··· 3097 3295 3098 3296 static int ext4_read_folio(struct file *file, struct folio *folio) 3099 3297 { 3100 - struct page *page = &folio->page; 3101 3298 int ret = -EAGAIN; 3102 - struct inode *inode = page->mapping->host; 3299 + struct inode *inode = folio->mapping->host; 3103 3300 3104 - trace_ext4_readpage(page); 3301 + trace_ext4_readpage(&folio->page); 3105 3302 3106 3303 if (ext4_has_inline_data(inode)) 3107 - ret = ext4_readpage_inline(inode, page); 3304 + ret = ext4_readpage_inline(inode, folio); 3108 3305 3109 3306 if (ret == -EAGAIN) 3110 - return ext4_mpage_readpages(inode, NULL, page); 3307 + return ext4_mpage_readpages(inode, NULL, folio); 3111 3308 3112 3309 return ret; 3113 3310 } ··· 3487 3686 }; 3488 3687 3489 3688 /* 3490 - * Whenever the folio is being dirtied, corresponding buffers should already 3491 - * be attached to the transaction (we take care of this in ext4_page_mkwrite() 3492 - * and ext4_write_begin()). However we cannot move buffers to dirty transaction 3493 - * lists here because ->dirty_folio is called under VFS locks and the folio 3494 - * is not necessarily locked. 3495 - * 3496 - * We cannot just dirty the folio and leave attached buffers clean, because the 3497 - * buffers' dirty state is "definitive". 
We cannot just set the buffers dirty 3498 - * or jbddirty because all the journalling code will explode. 3499 - * 3500 - * So what we do is to mark the folio "pending dirty" and next time writepage 3501 - * is called, propagate that into the buffers appropriately. 3689 + * For data=journal mode, folio should be marked dirty only when it was 3690 + * writeably mapped. When that happens, it was already attached to the 3691 + * transaction and marked as jbddirty (we take care of this in 3692 + * ext4_page_mkwrite()). On transaction commit, we writeprotect page mappings 3693 + * so we should have nothing to do here, except for the case when someone 3694 + * had the page pinned and dirtied the page through this pin (e.g. by doing 3695 + * direct IO to it). In that case we'd need to attach buffers here to the 3696 + * transaction but we cannot due to lock ordering. We cannot just dirty the 3697 + * folio and leave attached buffers clean, because the buffers' dirty state is 3698 + * "definitive". We cannot just set the buffers dirty or jbddirty because all 3699 + * the journalling code will explode. So what we do is to mark the folio 3700 + * "pending dirty" and next time ext4_writepages() is called, attach buffers 3701 + * to the transaction appropriately. 
3502 3702 */ 3503 3703 static bool ext4_journalled_dirty_folio(struct address_space *mapping, 3504 3704 struct folio *folio) 3505 3705 { 3506 3706 WARN_ON_ONCE(!folio_buffers(folio)); 3507 - folio_set_checked(folio); 3707 + if (folio_maybe_dma_pinned(folio)) 3708 + folio_set_checked(folio); 3508 3709 return filemap_dirty_folio(mapping, folio); 3509 3710 } 3510 3711 ··· 3612 3809 ext4_lblk_t iblock; 3613 3810 struct inode *inode = mapping->host; 3614 3811 struct buffer_head *bh; 3615 - struct page *page; 3812 + struct folio *folio; 3616 3813 int err = 0; 3617 3814 3618 - page = find_or_create_page(mapping, from >> PAGE_SHIFT, 3619 - mapping_gfp_constraint(mapping, ~__GFP_FS)); 3620 - if (!page) 3815 + folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT, 3816 + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, 3817 + mapping_gfp_constraint(mapping, ~__GFP_FS)); 3818 + if (!folio) 3621 3819 return -ENOMEM; 3622 3820 3623 3821 blocksize = inode->i_sb->s_blocksize; 3624 3822 3625 3823 iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); 3626 3824 3627 - if (!page_has_buffers(page)) 3628 - create_empty_buffers(page, blocksize, 0); 3825 + bh = folio_buffers(folio); 3826 + if (!bh) { 3827 + create_empty_buffers(&folio->page, blocksize, 0); 3828 + bh = folio_buffers(folio); 3829 + } 3629 3830 3630 3831 /* Find the buffer that contains "offset" */ 3631 - bh = page_buffers(page); 3632 3832 pos = blocksize; 3633 3833 while (offset >= pos) { 3634 3834 bh = bh->b_this_page; ··· 3653 3847 } 3654 3848 3655 3849 /* Ok, it's mapped. Make sure it's up-to-date */ 3656 - if (PageUptodate(page)) 3850 + if (folio_test_uptodate(folio)) 3657 3851 set_buffer_uptodate(bh); 3658 3852 3659 3853 if (!buffer_uptodate(bh)) { ··· 3663 3857 if (fscrypt_inode_uses_fs_layer_crypto(inode)) { 3664 3858 /* We expect the key to be set. 
*/ 3665 3859 BUG_ON(!fscrypt_has_encryption_key(inode)); 3666 - err = fscrypt_decrypt_pagecache_blocks(page_folio(page), 3860 + err = fscrypt_decrypt_pagecache_blocks(folio, 3667 3861 blocksize, 3668 3862 bh_offset(bh)); 3669 3863 if (err) { ··· 3679 3873 if (err) 3680 3874 goto unlock; 3681 3875 } 3682 - zero_user(page, offset, length); 3876 + folio_zero_range(folio, offset, length); 3683 3877 BUFFER_TRACE(bh, "zeroed end of block"); 3684 3878 3685 3879 if (ext4_should_journal_data(inode)) { 3686 - err = ext4_handle_dirty_metadata(handle, inode, bh); 3880 + err = ext4_dirty_journalled_data(handle, bh); 3687 3881 } else { 3688 3882 err = 0; 3689 3883 mark_buffer_dirty(bh); ··· 3693 3887 } 3694 3888 3695 3889 unlock: 3696 - unlock_page(page); 3697 - put_page(page); 3890 + folio_unlock(folio); 3891 + folio_put(folio); 3698 3892 return err; 3699 3893 } 3700 3894 ··· 5191 5385 * If the folio is fully truncated, we don't need to wait for any commit 5192 5386 * (and we even should not as __ext4_journalled_invalidate_folio() may 5193 5387 * strip all buffers from the folio but keep the folio dirty which can then 5194 - * confuse e.g. concurrent ext4_writepage() seeing dirty folio without 5388 + * confuse e.g. concurrent ext4_writepages() seeing dirty folio without 5195 5389 * buffers). Also we don't need to wait for any commit if all buffers in 5196 5390 * the folio remain valid. This is most beneficial for the common case of 5197 5391 * blocksize == PAGESIZE. ··· 6018 6212 vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) 6019 6213 { 6020 6214 struct vm_area_struct *vma = vmf->vma; 6021 - struct page *page = vmf->page; 6215 + struct folio *folio = page_folio(vmf->page); 6022 6216 loff_t size; 6023 6217 unsigned long len; 6024 6218 int err; ··· 6062 6256 goto out_ret; 6063 6257 } 6064 6258 6065 - lock_page(page); 6259 + folio_lock(folio); 6066 6260 size = i_size_read(inode); 6067 6261 /* Page got truncated from under us? 
*/ 6068 - if (page->mapping != mapping || page_offset(page) > size) { 6069 - unlock_page(page); 6262 + if (folio->mapping != mapping || folio_pos(folio) > size) { 6263 + folio_unlock(folio); 6070 6264 ret = VM_FAULT_NOPAGE; 6071 6265 goto out; 6072 6266 } 6073 6267 6074 - if (page->index == size >> PAGE_SHIFT) 6075 - len = size & ~PAGE_MASK; 6076 - else 6077 - len = PAGE_SIZE; 6268 + len = folio_size(folio); 6269 + if (folio_pos(folio) + len > size) 6270 + len = size - folio_pos(folio); 6078 6271 /* 6079 6272 * Return if we have all the buffers mapped. This avoids the need to do 6080 6273 * journal_start/journal_stop which can block and take a long time ··· 6081 6276 * This cannot be done for data journalling, as we have to add the 6082 6277 * inode to the transaction's list to writeprotect pages on commit. 6083 6278 */ 6084 - if (page_has_buffers(page)) { 6085 - if (!ext4_walk_page_buffers(NULL, inode, page_buffers(page), 6279 + if (folio_buffers(folio)) { 6280 + if (!ext4_walk_page_buffers(NULL, inode, folio_buffers(folio), 6086 6281 0, len, NULL, 6087 6282 ext4_bh_unmapped)) { 6088 6283 /* Wait so that we don't change page under IO */ 6089 - wait_for_stable_page(page); 6284 + folio_wait_stable(folio); 6090 6285 ret = VM_FAULT_LOCKED; 6091 6286 goto out; 6092 6287 } 6093 6288 } 6094 - unlock_page(page); 6289 + folio_unlock(folio); 6095 6290 /* OK, we need to fill the hole... */ 6096 6291 if (ext4_should_dioread_nolock(inode)) 6097 6292 get_block = ext4_get_block_unwritten; ··· 6112 6307 if (!ext4_should_journal_data(inode)) { 6113 6308 err = block_page_mkwrite(vma, vmf, get_block); 6114 6309 } else { 6115 - lock_page(page); 6310 + folio_lock(folio); 6116 6311 size = i_size_read(inode); 6117 6312 /* Page got truncated from under us? 
*/ 6118 - if (page->mapping != mapping || page_offset(page) > size) { 6313 + if (folio->mapping != mapping || folio_pos(folio) > size) { 6119 6314 ret = VM_FAULT_NOPAGE; 6120 6315 goto out_error; 6121 6316 } 6122 6317 6123 - if (page->index == size >> PAGE_SHIFT) 6124 - len = size & ~PAGE_MASK; 6125 - else 6126 - len = PAGE_SIZE; 6318 + len = folio_size(folio); 6319 + if (folio_pos(folio) + len > size) 6320 + len = size - folio_pos(folio); 6127 6321 6128 - err = __block_write_begin(page, 0, len, ext4_get_block); 6322 + err = __block_write_begin(&folio->page, 0, len, ext4_get_block); 6129 6323 if (!err) { 6130 6324 ret = VM_FAULT_SIGBUS; 6131 - if (ext4_walk_page_buffers(handle, inode, 6132 - page_buffers(page), 0, len, NULL, 6133 - do_journal_get_write_access)) 6325 + if (ext4_journal_page_buffers(handle, &folio->page, len)) 6134 6326 goto out_error; 6135 - if (ext4_walk_page_buffers(handle, inode, 6136 - page_buffers(page), 0, len, NULL, 6137 - write_end_fn)) 6138 - goto out_error; 6139 - if (ext4_jbd2_inode_add_write(handle, inode, 6140 - page_offset(page), len)) 6141 - goto out_error; 6142 - ext4_set_inode_state(inode, EXT4_STATE_JDATA); 6143 6327 } else { 6144 - unlock_page(page); 6328 + folio_unlock(folio); 6145 6329 } 6146 6330 } 6147 6331 ext4_journal_stop(handle); ··· 6143 6349 sb_end_pagefault(inode->i_sb); 6144 6350 return ret; 6145 6351 out_error: 6146 - unlock_page(page); 6352 + folio_unlock(folio); 6147 6353 ext4_journal_stop(handle); 6148 6354 goto out; 6149 6355 }
+436 -257
fs/ext4/mballoc.c
··· 1168 1168 if (groups_per_page > 1) { 1169 1169 i = sizeof(struct buffer_head *) * groups_per_page; 1170 1170 bh = kzalloc(i, gfp); 1171 - if (bh == NULL) { 1172 - err = -ENOMEM; 1173 - goto out; 1174 - } 1171 + if (bh == NULL) 1172 + return -ENOMEM; 1175 1173 } else 1176 1174 bh = &bhs; 1177 1175 ··· 1487 1489 put_page(page); 1488 1490 page = find_or_create_page(inode->i_mapping, pnum, gfp); 1489 1491 if (page) { 1490 - BUG_ON(page->mapping != inode->i_mapping); 1492 + if (WARN_RATELIMIT(page->mapping != inode->i_mapping, 1493 + "ext4: bitmap's paging->mapping != inode->i_mapping\n")) { 1494 + /* should never happen */ 1495 + unlock_page(page); 1496 + ret = -EINVAL; 1497 + goto err; 1498 + } 1491 1499 if (!PageUptodate(page)) { 1492 1500 ret = ext4_mb_init_cache(page, NULL, gfp); 1493 1501 if (ret) { ··· 1529 1525 put_page(page); 1530 1526 page = find_or_create_page(inode->i_mapping, pnum, gfp); 1531 1527 if (page) { 1532 - BUG_ON(page->mapping != inode->i_mapping); 1528 + if (WARN_RATELIMIT(page->mapping != inode->i_mapping, 1529 + "ext4: buddy bitmap's page->mapping != inode->i_mapping\n")) { 1530 + /* should never happen */ 1531 + unlock_page(page); 1532 + ret = -EINVAL; 1533 + goto err; 1534 + } 1533 1535 if (!PageUptodate(page)) { 1534 1536 ret = ext4_mb_init_cache(page, e4b->bd_bitmap, 1535 1537 gfp); ··· 1567 1557 put_page(page); 1568 1558 if (e4b->bd_bitmap_page) 1569 1559 put_page(e4b->bd_bitmap_page); 1570 - if (e4b->bd_buddy_page) 1571 - put_page(e4b->bd_buddy_page); 1560 + 1572 1561 e4b->bd_buddy = NULL; 1573 1562 e4b->bd_bitmap = NULL; 1574 1563 return ret; ··· 1730 1721 break; 1731 1722 order++; 1732 1723 1733 - if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) { 1724 + buddy2 = mb_find_buddy(e4b, order, &max); 1725 + if (!buddy2) { 1734 1726 mb_clear_bits(buddy, first, last - first + 1); 1735 1727 e4b->bd_info->bb_counters[order - 1] += last - first + 1; 1736 1728 break; ··· 2031 2021 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 
2032 2022 struct ext4_free_extent *bex = &ac->ac_b_ex; 2033 2023 struct ext4_free_extent *gex = &ac->ac_g_ex; 2034 - struct ext4_free_extent ex; 2035 - int max; 2036 2024 2037 2025 if (ac->ac_status == AC_STATUS_FOUND) 2038 2026 return; ··· 2049 2041 if (bex->fe_len < gex->fe_len) 2050 2042 return; 2051 2043 2052 - if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan) 2053 - && bex->fe_group == e4b->bd_group) { 2054 - /* recheck chunk's availability - we don't know 2055 - * when it was found (within this lock-unlock 2056 - * period or not) */ 2057 - max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex); 2058 - if (max >= gex->fe_len) { 2059 - ext4_mb_use_best_found(ac, e4b); 2060 - return; 2061 - } 2062 - } 2044 + if (finish_group) 2045 + ext4_mb_use_best_found(ac, e4b); 2063 2046 } 2064 2047 2065 2048 /* ··· 2123 2124 } 2124 2125 2125 2126 static noinline_for_stack 2126 - int ext4_mb_try_best_found(struct ext4_allocation_context *ac, 2127 + void ext4_mb_try_best_found(struct ext4_allocation_context *ac, 2127 2128 struct ext4_buddy *e4b) 2128 2129 { 2129 2130 struct ext4_free_extent ex = ac->ac_b_ex; ··· 2134 2135 BUG_ON(ex.fe_len <= 0); 2135 2136 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); 2136 2137 if (err) 2137 - return err; 2138 + return; 2138 2139 2139 2140 ext4_lock_group(ac->ac_sb, group); 2140 2141 max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); ··· 2146 2147 2147 2148 ext4_unlock_group(ac->ac_sb, group); 2148 2149 ext4_mb_unload_buddy(e4b); 2149 - 2150 - return 0; 2151 2150 } 2152 2151 2153 2152 static noinline_for_stack ··· 2159 2162 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 2160 2163 struct ext4_free_extent ex; 2161 2164 2162 - if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) 2165 + if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY))) 2163 2166 return 0; 2164 2167 if (grp->bb_free == 0) 2165 2168 return 0; ··· 2233 2236 continue; 2234 2237 2235 2238 buddy = mb_find_buddy(e4b, i, &max); 2236 
- BUG_ON(buddy == NULL); 2239 + if (WARN_RATELIMIT(buddy == NULL, 2240 + "ext4: mb_simple_scan_group: mb_find_buddy failed, (%d)\n", i)) 2241 + continue; 2237 2242 2238 2243 k = mb_find_next_zero_bit(buddy, max, 0); 2239 2244 if (k >= max) { ··· 2568 2569 void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, 2569 2570 unsigned int nr) 2570 2571 { 2571 - while (nr-- > 0) { 2572 - struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, 2573 - NULL); 2574 - struct ext4_group_info *grp = ext4_get_group_info(sb, group); 2572 + struct ext4_group_desc *gdp; 2573 + struct ext4_group_info *grp; 2575 2574 2575 + while (nr-- > 0) { 2576 2576 if (!group) 2577 2577 group = ext4_get_groups_count(sb); 2578 2578 group--; 2579 + gdp = ext4_get_group_desc(sb, group, NULL); 2579 2580 grp = ext4_get_group_info(sb, group); 2580 2581 2581 2582 if (EXT4_MB_GRP_NEED_INIT(grp) && ··· 3083 3084 if (meta_group_info == NULL) { 3084 3085 ext4_msg(sb, KERN_ERR, "can't allocate mem " 3085 3086 "for a buddy group"); 3086 - goto exit_meta_group_info; 3087 + return -ENOMEM; 3087 3088 } 3088 3089 rcu_read_lock(); 3089 3090 rcu_dereference(sbi->s_group_info)[idx] = meta_group_info; ··· 3137 3138 group_info[idx] = NULL; 3138 3139 rcu_read_unlock(); 3139 3140 } 3140 - exit_meta_group_info: 3141 3141 return -ENOMEM; 3142 3142 } /* ext4_mb_add_groupinfo */ 3143 3143 ··· 3417 3419 sbi->s_mb_stats = MB_DEFAULT_STATS; 3418 3420 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 3419 3421 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 3420 - sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC; 3421 3422 /* 3422 3423 * The default group preallocation is 512, which for 4k block 3423 3424 * sizes translates to 2 megabytes. 
However for bigalloc file ··· 3603 3606 { 3604 3607 struct ext4_buddy e4b; 3605 3608 struct ext4_group_info *db; 3606 - int err, count = 0, count2 = 0; 3609 + int err, count = 0; 3607 3610 3608 3611 mb_debug(sb, "gonna free %u blocks in group %u (0x%p):", 3609 3612 entry->efd_count, entry->efd_group, entry); ··· 3619 3622 db = e4b.bd_info; 3620 3623 /* there are blocks to put in buddy to make them really free */ 3621 3624 count += entry->efd_count; 3622 - count2++; 3623 3625 ext4_lock_group(sb, entry->efd_group); 3624 3626 /* Take it out of per group rb tree */ 3625 3627 rb_erase(&entry->efd_node, &(db->bb_free_root)); ··· 3643 3647 ext4_unlock_group(sb, entry->efd_group); 3644 3648 ext4_mb_unload_buddy(&e4b); 3645 3649 3646 - mb_debug(sb, "freed %d blocks in %d structures\n", count, 3647 - count2); 3650 + mb_debug(sb, "freed %d blocks in 1 structures\n", count); 3648 3651 } 3649 3652 3650 3653 /* ··· 3752 3757 3753 3758 bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); 3754 3759 if (IS_ERR(bitmap_bh)) { 3755 - err = PTR_ERR(bitmap_bh); 3756 - bitmap_bh = NULL; 3757 - goto out_err; 3760 + return PTR_ERR(bitmap_bh); 3758 3761 } 3759 3762 3760 3763 BUFFER_TRACE(bitmap_bh, "getting write access"); ··· 3815 3822 } 3816 3823 len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; 3817 3824 ext4_free_group_clusters_set(sb, gdp, len); 3818 - ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh); 3825 + ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); 3819 3826 ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp); 3820 3827 3821 3828 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); ··· 3922 3929 clen = ext4_free_group_clusters(sb, gdp) + clen_changed; 3923 3930 3924 3931 ext4_free_group_clusters_set(sb, gdp, clen); 3925 - ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh); 3932 + ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); 3926 3933 ext4_group_desc_csum_set(sb, group, gdp); 3927 3934 3928 3935 ext4_unlock_group(sb, group); ··· 3978 
3985 } 3979 3986 3980 3987 /* 3988 + * This function returns the next element to look at during inode 3989 + * PA rbtree walk. We assume that we have held the inode PA rbtree lock 3990 + * (ei->i_prealloc_lock) 3991 + * 3992 + * new_start The start of the range we want to compare 3993 + * cur_start The existing start that we are comparing against 3994 + * node The node of the rb_tree 3995 + */ 3996 + static inline struct rb_node* 3997 + ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_node *node) 3998 + { 3999 + if (new_start < cur_start) 4000 + return node->rb_left; 4001 + else 4002 + return node->rb_right; 4003 + } 4004 + 4005 + static inline void 4006 + ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, 4007 + ext4_lblk_t start, ext4_lblk_t end) 4008 + { 4009 + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4010 + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 4011 + struct ext4_prealloc_space *tmp_pa; 4012 + ext4_lblk_t tmp_pa_start, tmp_pa_end; 4013 + struct rb_node *iter; 4014 + 4015 + read_lock(&ei->i_prealloc_lock); 4016 + for (iter = ei->i_prealloc_node.rb_node; iter; 4017 + iter = ext4_mb_pa_rb_next_iter(start, tmp_pa_start, iter)) { 4018 + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, 4019 + pa_node.inode_node); 4020 + tmp_pa_start = tmp_pa->pa_lstart; 4021 + tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); 4022 + 4023 + spin_lock(&tmp_pa->pa_lock); 4024 + if (tmp_pa->pa_deleted == 0) 4025 + BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start)); 4026 + spin_unlock(&tmp_pa->pa_lock); 4027 + } 4028 + read_unlock(&ei->i_prealloc_lock); 4029 + } 4030 + 4031 + /* 4032 + * Given an allocation context "ac" and a range "start", "end", check 4033 + * and adjust boundaries if the range overlaps with any of the existing 4034 + * preallocatoins stored in the corresponding inode of the allocation context. 
4035 + * 4036 + * Parameters: 4037 + * ac allocation context 4038 + * start start of the new range 4039 + * end end of the new range 4040 + */ 4041 + static inline void 4042 + ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, 4043 + ext4_lblk_t *start, ext4_lblk_t *end) 4044 + { 4045 + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 4046 + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4047 + struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL; 4048 + struct rb_node *iter; 4049 + ext4_lblk_t new_start, new_end; 4050 + ext4_lblk_t tmp_pa_start, tmp_pa_end, left_pa_end = -1, right_pa_start = -1; 4051 + 4052 + new_start = *start; 4053 + new_end = *end; 4054 + 4055 + /* 4056 + * Adjust the normalized range so that it doesn't overlap with any 4057 + * existing preallocated blocks(PAs). Make sure to hold the rbtree lock 4058 + * so it doesn't change underneath us. 4059 + */ 4060 + read_lock(&ei->i_prealloc_lock); 4061 + 4062 + /* Step 1: find any one immediate neighboring PA of the normalized range */ 4063 + for (iter = ei->i_prealloc_node.rb_node; iter; 4064 + iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, 4065 + tmp_pa_start, iter)) { 4066 + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, 4067 + pa_node.inode_node); 4068 + tmp_pa_start = tmp_pa->pa_lstart; 4069 + tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); 4070 + 4071 + /* PA must not overlap original request */ 4072 + spin_lock(&tmp_pa->pa_lock); 4073 + if (tmp_pa->pa_deleted == 0) 4074 + BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end || 4075 + ac->ac_o_ex.fe_logical < tmp_pa_start)); 4076 + spin_unlock(&tmp_pa->pa_lock); 4077 + } 4078 + 4079 + /* 4080 + * Step 2: check if the found PA is left or right neighbor and 4081 + * get the other neighbor 4082 + */ 4083 + if (tmp_pa) { 4084 + if (tmp_pa->pa_lstart < ac->ac_o_ex.fe_logical) { 4085 + struct rb_node *tmp; 4086 + 4087 + left_pa = tmp_pa; 4088 + tmp = rb_next(&left_pa->pa_node.inode_node); 4089 
+ if (tmp) { 4090 + right_pa = rb_entry(tmp, 4091 + struct ext4_prealloc_space, 4092 + pa_node.inode_node); 4093 + } 4094 + } else { 4095 + struct rb_node *tmp; 4096 + 4097 + right_pa = tmp_pa; 4098 + tmp = rb_prev(&right_pa->pa_node.inode_node); 4099 + if (tmp) { 4100 + left_pa = rb_entry(tmp, 4101 + struct ext4_prealloc_space, 4102 + pa_node.inode_node); 4103 + } 4104 + } 4105 + } 4106 + 4107 + /* Step 3: get the non deleted neighbors */ 4108 + if (left_pa) { 4109 + for (iter = &left_pa->pa_node.inode_node;; 4110 + iter = rb_prev(iter)) { 4111 + if (!iter) { 4112 + left_pa = NULL; 4113 + break; 4114 + } 4115 + 4116 + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, 4117 + pa_node.inode_node); 4118 + left_pa = tmp_pa; 4119 + spin_lock(&tmp_pa->pa_lock); 4120 + if (tmp_pa->pa_deleted == 0) { 4121 + spin_unlock(&tmp_pa->pa_lock); 4122 + break; 4123 + } 4124 + spin_unlock(&tmp_pa->pa_lock); 4125 + } 4126 + } 4127 + 4128 + if (right_pa) { 4129 + for (iter = &right_pa->pa_node.inode_node;; 4130 + iter = rb_next(iter)) { 4131 + if (!iter) { 4132 + right_pa = NULL; 4133 + break; 4134 + } 4135 + 4136 + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, 4137 + pa_node.inode_node); 4138 + right_pa = tmp_pa; 4139 + spin_lock(&tmp_pa->pa_lock); 4140 + if (tmp_pa->pa_deleted == 0) { 4141 + spin_unlock(&tmp_pa->pa_lock); 4142 + break; 4143 + } 4144 + spin_unlock(&tmp_pa->pa_lock); 4145 + } 4146 + } 4147 + 4148 + if (left_pa) { 4149 + left_pa_end = 4150 + left_pa->pa_lstart + EXT4_C2B(sbi, left_pa->pa_len); 4151 + BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical); 4152 + } 4153 + 4154 + if (right_pa) { 4155 + right_pa_start = right_pa->pa_lstart; 4156 + BUG_ON(right_pa_start <= ac->ac_o_ex.fe_logical); 4157 + } 4158 + 4159 + /* Step 4: trim our normalized range to not overlap with the neighbors */ 4160 + if (left_pa) { 4161 + if (left_pa_end > new_start) 4162 + new_start = left_pa_end; 4163 + } 4164 + 4165 + if (right_pa) { 4166 + if (right_pa_start < new_end) 4167 + new_end = 
right_pa_start; 4168 + } 4169 + read_unlock(&ei->i_prealloc_lock); 4170 + 4171 + /* XXX: extra loop to check we really don't overlap preallocations */ 4172 + ext4_mb_pa_assert_overlap(ac, new_start, new_end); 4173 + 4174 + *start = new_start; 4175 + *end = new_end; 4176 + } 4177 + 4178 + /* 3981 4179 * Normalization means making request better in terms of 3982 4180 * size and alignment 3983 4181 */ ··· 4177 3993 struct ext4_allocation_request *ar) 4178 3994 { 4179 3995 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 3996 + struct ext4_super_block *es = sbi->s_es; 4180 3997 int bsbits, max; 4181 3998 ext4_lblk_t end; 4182 3999 loff_t size, start_off; 4183 4000 loff_t orig_size __maybe_unused; 4184 4001 ext4_lblk_t start; 4185 - struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 4186 - struct ext4_prealloc_space *pa; 4187 4002 4188 4003 /* do normalize only data requests, metadata requests 4189 4004 do not need preallocation */ ··· 4251 4068 size = 8 * 1024 * 1024; 4252 4069 } else { 4253 4070 start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits; 4254 - size = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb), 4071 + size = (loff_t) EXT4_C2B(sbi, 4255 4072 ac->ac_o_ex.fe_len) << bsbits; 4256 4073 } 4257 4074 size = size >> bsbits; ··· 4283 4100 4284 4101 end = start + size; 4285 4102 4286 - /* check we don't cross already preallocated blocks */ 4287 - rcu_read_lock(); 4288 - list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 4289 - ext4_lblk_t pa_end; 4103 + ext4_mb_pa_adjust_overlap(ac, &start, &end); 4290 4104 4291 - if (pa->pa_deleted) 4292 - continue; 4293 - spin_lock(&pa->pa_lock); 4294 - if (pa->pa_deleted) { 4295 - spin_unlock(&pa->pa_lock); 4296 - continue; 4297 - } 4298 - 4299 - pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), 4300 - pa->pa_len); 4301 - 4302 - /* PA must not overlap original request */ 4303 - BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || 4304 - ac->ac_o_ex.fe_logical < pa->pa_lstart)); 4305 - 4306 - /* skip PAs this normalized request 
doesn't overlap with */ 4307 - if (pa->pa_lstart >= end || pa_end <= start) { 4308 - spin_unlock(&pa->pa_lock); 4309 - continue; 4310 - } 4311 - BUG_ON(pa->pa_lstart <= start && pa_end >= end); 4312 - 4313 - /* adjust start or end to be adjacent to this pa */ 4314 - if (pa_end <= ac->ac_o_ex.fe_logical) { 4315 - BUG_ON(pa_end < start); 4316 - start = pa_end; 4317 - } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { 4318 - BUG_ON(pa->pa_lstart > end); 4319 - end = pa->pa_lstart; 4320 - } 4321 - spin_unlock(&pa->pa_lock); 4322 - } 4323 - rcu_read_unlock(); 4324 4105 size = end - start; 4325 - 4326 - /* XXX: extra loop to check we really don't overlap preallocations */ 4327 - rcu_read_lock(); 4328 - list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 4329 - ext4_lblk_t pa_end; 4330 - 4331 - spin_lock(&pa->pa_lock); 4332 - if (pa->pa_deleted == 0) { 4333 - pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), 4334 - pa->pa_len); 4335 - BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); 4336 - } 4337 - spin_unlock(&pa->pa_lock); 4338 - } 4339 - rcu_read_unlock(); 4340 4106 4341 4107 /* 4342 4108 * In this function "start" and "size" are normalized for better ··· 4297 4165 * provide gurantee on number of contiguous blocks allocation since that 4298 4166 * depends upon free space left, etc). 4299 4167 * In case of inode pa, later we use the allocated blocks 4300 - * [pa_start + fe_logical - pa_lstart, fe_len/size] from the preallocated 4168 + * [pa_pstart + fe_logical - pa_lstart, fe_len/size] from the preallocated 4301 4169 * range of goal/best blocks [start, size] to put it at the 4302 4170 * ac_o_ex.fe_logical extent of this inode. 
4303 4171 * (See ext4_mb_use_inode_pa() for more details) ··· 4320 4188 ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); 4321 4189 4322 4190 /* define goal start in order to merge */ 4323 - if (ar->pright && (ar->lright == (start + size))) { 4191 + if (ar->pright && (ar->lright == (start + size)) && 4192 + ar->pright >= size && 4193 + ar->pright - size >= le32_to_cpu(es->s_first_data_block)) { 4324 4194 /* merge to the right */ 4325 4195 ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, 4326 - &ac->ac_f_ex.fe_group, 4327 - &ac->ac_f_ex.fe_start); 4196 + &ac->ac_g_ex.fe_group, 4197 + &ac->ac_g_ex.fe_start); 4328 4198 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 4329 4199 } 4330 - if (ar->pleft && (ar->lleft + 1 == start)) { 4200 + if (ar->pleft && (ar->lleft + 1 == start) && 4201 + ar->pleft + 1 < ext4_blocks_count(es)) { 4331 4202 /* merge to the left */ 4332 4203 ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, 4333 - &ac->ac_f_ex.fe_group, 4334 - &ac->ac_f_ex.fe_start); 4204 + &ac->ac_g_ex.fe_group, 4205 + &ac->ac_g_ex.fe_start); 4335 4206 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 4336 4207 } 4337 4208 ··· 4382 4247 if (ac->ac_f_ex.fe_len == 0) 4383 4248 return; 4384 4249 err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); 4385 - if (err) { 4250 + if (WARN_RATELIMIT(err, 4251 + "ext4: mb_load_buddy failed (%d)", err)) 4386 4252 /* 4387 4253 * This should never happen since we pin the 4388 4254 * pages in the ext4_allocation_context so 4389 4255 * ext4_mb_load_buddy() should never fail. 
4390 4256 */ 4391 - WARN(1, "mb_load_buddy failed (%d)", err); 4392 4257 return; 4393 - } 4394 4258 ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group); 4395 4259 mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, 4396 4260 ac->ac_f_ex.fe_len); ··· 4397 4263 ext4_mb_unload_buddy(&e4b); 4398 4264 return; 4399 4265 } 4400 - if (pa->pa_type == MB_INODE_PA) 4266 + if (pa->pa_type == MB_INODE_PA) { 4267 + spin_lock(&pa->pa_lock); 4401 4268 pa->pa_free += ac->ac_b_ex.fe_len; 4269 + spin_unlock(&pa->pa_lock); 4270 + } 4402 4271 } 4403 4272 4404 4273 /* ··· 4429 4292 BUG_ON(start < pa->pa_pstart); 4430 4293 BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); 4431 4294 BUG_ON(pa->pa_free < len); 4295 + BUG_ON(ac->ac_b_ex.fe_len <= 0); 4432 4296 pa->pa_free -= len; 4433 4297 4434 4298 mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa); ··· 4450 4312 ac->ac_status = AC_STATUS_FOUND; 4451 4313 ac->ac_pa = pa; 4452 4314 4453 - /* we don't correct pa_pstart or pa_plen here to avoid 4315 + /* we don't correct pa_pstart or pa_len here to avoid 4454 4316 * possible race when the group is being loaded concurrently 4455 4317 * instead we correct pa later, after blocks are marked 4456 4318 * in on-disk bitmap -- see ext4_mb_release_context() 4457 4319 * Other CPUs are prevented from allocating from this pa by lg_mutex 4458 4320 */ 4459 4321 mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n", 4460 - pa->pa_lstart-len, len, pa); 4322 + pa->pa_lstart, len, pa); 4461 4323 } 4462 4324 4463 4325 /* ··· 4499 4361 int order, i; 4500 4362 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 4501 4363 struct ext4_locality_group *lg; 4502 - struct ext4_prealloc_space *pa, *cpa = NULL; 4364 + struct ext4_prealloc_space *tmp_pa, *cpa = NULL; 4365 + ext4_lblk_t tmp_pa_start, tmp_pa_end; 4366 + struct rb_node *iter; 4503 4367 ext4_fsblk_t goal_block; 4504 4368 4505 4369 /* only data can be preallocated */ ··· 4509 4369 return false; 4510 4370 4511 4371 /* first, try per-file 
preallocation */ 4512 - rcu_read_lock(); 4513 - list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 4372 + read_lock(&ei->i_prealloc_lock); 4373 + for (iter = ei->i_prealloc_node.rb_node; iter; 4374 + iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, 4375 + tmp_pa_start, iter)) { 4376 + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, 4377 + pa_node.inode_node); 4514 4378 4515 4379 /* all fields in this condition don't change, 4516 4380 * so we can skip locking for them */ 4517 - if (ac->ac_o_ex.fe_logical < pa->pa_lstart || 4518 - ac->ac_o_ex.fe_logical >= (pa->pa_lstart + 4519 - EXT4_C2B(sbi, pa->pa_len))) 4381 + tmp_pa_start = tmp_pa->pa_lstart; 4382 + tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); 4383 + 4384 + /* original request start doesn't lie in this PA */ 4385 + if (ac->ac_o_ex.fe_logical < tmp_pa_start || 4386 + ac->ac_o_ex.fe_logical >= tmp_pa_end) 4520 4387 continue; 4521 4388 4522 4389 /* non-extent files can't have physical blocks past 2^32 */ 4523 4390 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && 4524 - (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) > 4525 - EXT4_MAX_BLOCK_FILE_PHYS)) 4526 - continue; 4391 + (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) > 4392 + EXT4_MAX_BLOCK_FILE_PHYS)) { 4393 + /* 4394 + * Since PAs don't overlap, we won't find any 4395 + * other PA to satisfy this. 
4396 + */ 4397 + break; 4398 + } 4527 4399 4528 4400 /* found preallocated blocks, use them */ 4529 - spin_lock(&pa->pa_lock); 4530 - if (pa->pa_deleted == 0 && pa->pa_free) { 4531 - atomic_inc(&pa->pa_count); 4532 - ext4_mb_use_inode_pa(ac, pa); 4533 - spin_unlock(&pa->pa_lock); 4401 + spin_lock(&tmp_pa->pa_lock); 4402 + if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free) { 4403 + atomic_inc(&tmp_pa->pa_count); 4404 + ext4_mb_use_inode_pa(ac, tmp_pa); 4405 + spin_unlock(&tmp_pa->pa_lock); 4534 4406 ac->ac_criteria = 10; 4535 - rcu_read_unlock(); 4407 + read_unlock(&ei->i_prealloc_lock); 4536 4408 return true; 4537 4409 } 4538 - spin_unlock(&pa->pa_lock); 4410 + spin_unlock(&tmp_pa->pa_lock); 4539 4411 } 4540 - rcu_read_unlock(); 4412 + read_unlock(&ei->i_prealloc_lock); 4541 4413 4542 4414 /* can we use group allocation? */ 4543 4415 if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) ··· 4571 4419 */ 4572 4420 for (i = order; i < PREALLOC_TB_SIZE; i++) { 4573 4421 rcu_read_lock(); 4574 - list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], 4575 - pa_inode_list) { 4576 - spin_lock(&pa->pa_lock); 4577 - if (pa->pa_deleted == 0 && 4578 - pa->pa_free >= ac->ac_o_ex.fe_len) { 4422 + list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[i], 4423 + pa_node.lg_list) { 4424 + spin_lock(&tmp_pa->pa_lock); 4425 + if (tmp_pa->pa_deleted == 0 && 4426 + tmp_pa->pa_free >= ac->ac_o_ex.fe_len) { 4579 4427 4580 4428 cpa = ext4_mb_check_group_pa(goal_block, 4581 - pa, cpa); 4429 + tmp_pa, cpa); 4582 4430 } 4583 - spin_unlock(&pa->pa_lock); 4431 + spin_unlock(&tmp_pa->pa_lock); 4584 4432 } 4585 4433 rcu_read_unlock(); 4586 4434 } ··· 4677 4525 } 4678 4526 } 4679 4527 4680 - static void ext4_mb_pa_callback(struct rcu_head *head) 4528 + static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa) 4681 4529 { 4682 - struct ext4_prealloc_space *pa; 4683 - pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); 4684 - 4530 + BUG_ON(!pa); 4685 4531 
BUG_ON(atomic_read(&pa->pa_count)); 4686 4532 BUG_ON(pa->pa_deleted == 0); 4687 4533 kmem_cache_free(ext4_pspace_cachep, pa); 4534 + } 4535 + 4536 + static void ext4_mb_pa_callback(struct rcu_head *head) 4537 + { 4538 + struct ext4_prealloc_space *pa; 4539 + 4540 + pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); 4541 + ext4_mb_pa_free(pa); 4688 4542 } 4689 4543 4690 4544 /* ··· 4702 4544 { 4703 4545 ext4_group_t grp; 4704 4546 ext4_fsblk_t grp_blk; 4547 + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 4705 4548 4706 4549 /* in this short window concurrent discard can set pa_deleted */ 4707 4550 spin_lock(&pa->pa_lock); ··· 4747 4588 list_del(&pa->pa_group_list); 4748 4589 ext4_unlock_group(sb, grp); 4749 4590 4750 - spin_lock(pa->pa_obj_lock); 4751 - list_del_rcu(&pa->pa_inode_list); 4752 - spin_unlock(pa->pa_obj_lock); 4591 + if (pa->pa_type == MB_INODE_PA) { 4592 + write_lock(pa->pa_node_lock.inode_lock); 4593 + rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); 4594 + write_unlock(pa->pa_node_lock.inode_lock); 4595 + ext4_mb_pa_free(pa); 4596 + } else { 4597 + spin_lock(pa->pa_node_lock.lg_lock); 4598 + list_del_rcu(&pa->pa_node.lg_list); 4599 + spin_unlock(pa->pa_node_lock.lg_lock); 4600 + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4601 + } 4602 + } 4753 4603 4754 - call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4604 + static void ext4_mb_pa_rb_insert(struct rb_root *root, struct rb_node *new) 4605 + { 4606 + struct rb_node **iter = &root->rb_node, *parent = NULL; 4607 + struct ext4_prealloc_space *iter_pa, *new_pa; 4608 + ext4_lblk_t iter_start, new_start; 4609 + 4610 + while (*iter) { 4611 + iter_pa = rb_entry(*iter, struct ext4_prealloc_space, 4612 + pa_node.inode_node); 4613 + new_pa = rb_entry(new, struct ext4_prealloc_space, 4614 + pa_node.inode_node); 4615 + iter_start = iter_pa->pa_lstart; 4616 + new_start = new_pa->pa_lstart; 4617 + 4618 + parent = *iter; 4619 + if (new_start < iter_start) 4620 + iter = 
&((*iter)->rb_left); 4621 + else 4622 + iter = &((*iter)->rb_right); 4623 + } 4624 + 4625 + rb_link_node(new, parent, iter); 4626 + rb_insert_color(new, root); 4755 4627 } 4756 4628 4757 4629 /* ··· 4806 4616 pa = ac->ac_pa; 4807 4617 4808 4618 if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { 4809 - int winl; 4810 - int wins; 4811 - int win; 4812 - int offs; 4619 + int new_bex_start; 4620 + int new_bex_end; 4813 4621 4814 4622 /* we can't allocate as much as normalizer wants. 4815 4623 * so, found space must get proper lstart ··· 4815 4627 BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); 4816 4628 BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); 4817 4629 4818 - /* we're limited by original request in that 4819 - * logical block must be covered any way 4820 - * winl is window we can move our chunk within */ 4821 - winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; 4630 + /* 4631 + * Use the below logic for adjusting best extent as it keeps 4632 + * fragmentation in check while ensuring logical range of best 4633 + * extent doesn't overflow out of goal extent: 4634 + * 4635 + * 1. Check if best ex can be kept at end of goal and still 4636 + * cover original start 4637 + * 2. Else, check if best ex can be kept at start of goal and 4638 + * still cover original start 4639 + * 3. Else, keep the best ex at start of original request. 
4640 + */ 4641 + new_bex_end = ac->ac_g_ex.fe_logical + 4642 + EXT4_C2B(sbi, ac->ac_g_ex.fe_len); 4643 + new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 4644 + if (ac->ac_o_ex.fe_logical >= new_bex_start) 4645 + goto adjust_bex; 4822 4646 4823 - /* also, we should cover whole original request */ 4824 - wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len); 4647 + new_bex_start = ac->ac_g_ex.fe_logical; 4648 + new_bex_end = 4649 + new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 4650 + if (ac->ac_o_ex.fe_logical < new_bex_end) 4651 + goto adjust_bex; 4825 4652 4826 - /* the smallest one defines real window */ 4827 - win = min(winl, wins); 4653 + new_bex_start = ac->ac_o_ex.fe_logical; 4654 + new_bex_end = 4655 + new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 4828 4656 4829 - offs = ac->ac_o_ex.fe_logical % 4830 - EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 4831 - if (offs && offs < win) 4832 - win = offs; 4657 + adjust_bex: 4658 + ac->ac_b_ex.fe_logical = new_bex_start; 4833 4659 4834 - ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - 4835 - EXT4_NUM_B2C(sbi, win); 4836 4660 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); 4837 4661 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); 4662 + BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical + 4663 + EXT4_C2B(sbi, ac->ac_g_ex.fe_len))); 4838 4664 } 4839 - 4840 - /* preallocation can change ac_b_ex, thus we store actually 4841 - * allocated blocks for history */ 4842 - ac->ac_f_ex = ac->ac_b_ex; 4843 4665 4844 4666 pa->pa_lstart = ac->ac_b_ex.fe_logical; 4845 4667 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 4846 4668 pa->pa_len = ac->ac_b_ex.fe_len; 4847 4669 pa->pa_free = pa->pa_len; 4848 4670 spin_lock_init(&pa->pa_lock); 4849 - INIT_LIST_HEAD(&pa->pa_inode_list); 4850 4671 INIT_LIST_HEAD(&pa->pa_group_list); 4851 4672 pa->pa_deleted = 0; 4852 4673 pa->pa_type = MB_INODE_PA; ··· 4864 4667 pa->pa_len, pa->pa_lstart); 4865 4668 trace_ext4_mb_new_inode_pa(ac, pa); 4866 4669 4867 - 
ext4_mb_use_inode_pa(ac, pa); 4868 4670 atomic_add(pa->pa_free, &sbi->s_mb_preallocated); 4671 + ext4_mb_use_inode_pa(ac, pa); 4869 4672 4870 4673 ei = EXT4_I(ac->ac_inode); 4871 4674 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); 4872 4675 4873 - pa->pa_obj_lock = &ei->i_prealloc_lock; 4676 + pa->pa_node_lock.inode_lock = &ei->i_prealloc_lock; 4874 4677 pa->pa_inode = ac->ac_inode; 4875 4678 4876 4679 list_add(&pa->pa_group_list, &grp->bb_prealloc_list); 4877 4680 4878 - spin_lock(pa->pa_obj_lock); 4879 - list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); 4880 - spin_unlock(pa->pa_obj_lock); 4681 + write_lock(pa->pa_node_lock.inode_lock); 4682 + ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node); 4683 + write_unlock(pa->pa_node_lock.inode_lock); 4881 4684 atomic_inc(&ei->i_prealloc_active); 4882 4685 } 4883 4686 ··· 4900 4703 4901 4704 pa = ac->ac_pa; 4902 4705 4903 - /* preallocation can change ac_b_ex, thus we store actually 4904 - * allocated blocks for history */ 4905 - ac->ac_f_ex = ac->ac_b_ex; 4906 - 4907 4706 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 4908 4707 pa->pa_lstart = pa->pa_pstart; 4909 4708 pa->pa_len = ac->ac_b_ex.fe_len; 4910 4709 pa->pa_free = pa->pa_len; 4911 4710 spin_lock_init(&pa->pa_lock); 4912 - INIT_LIST_HEAD(&pa->pa_inode_list); 4711 + INIT_LIST_HEAD(&pa->pa_node.lg_list); 4913 4712 INIT_LIST_HEAD(&pa->pa_group_list); 4914 4713 pa->pa_deleted = 0; 4915 4714 pa->pa_type = MB_GROUP_PA; ··· 4921 4728 lg = ac->ac_lg; 4922 4729 BUG_ON(lg == NULL); 4923 4730 4924 - pa->pa_obj_lock = &lg->lg_prealloc_lock; 4731 + pa->pa_node_lock.lg_lock = &lg->lg_prealloc_lock; 4925 4732 pa->pa_inode = NULL; 4926 4733 4927 4734 list_add(&pa->pa_group_list, &grp->bb_prealloc_list); ··· 5039 4846 struct ext4_prealloc_space *pa, *tmp; 5040 4847 struct list_head list; 5041 4848 struct ext4_buddy e4b; 4849 + struct ext4_inode_info *ei; 5042 4850 int err; 5043 4851 int free = 0; 5044 4852 ··· 5098 4904 
list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 5099 4905 5100 4906 /* remove from object (inode or locality group) */ 5101 - spin_lock(pa->pa_obj_lock); 5102 - list_del_rcu(&pa->pa_inode_list); 5103 - spin_unlock(pa->pa_obj_lock); 5104 - 5105 - if (pa->pa_type == MB_GROUP_PA) 5106 - ext4_mb_release_group_pa(&e4b, pa); 5107 - else 5108 - ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); 4907 + if (pa->pa_type == MB_GROUP_PA) { 4908 + spin_lock(pa->pa_node_lock.lg_lock); 4909 + list_del_rcu(&pa->pa_node.lg_list); 4910 + spin_unlock(pa->pa_node_lock.lg_lock); 4911 + } else { 4912 + write_lock(pa->pa_node_lock.inode_lock); 4913 + ei = EXT4_I(pa->pa_inode); 4914 + rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); 4915 + write_unlock(pa->pa_node_lock.inode_lock); 4916 + } 5109 4917 5110 4918 list_del(&pa->u.pa_tmp_list); 5111 - call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4919 + 4920 + if (pa->pa_type == MB_GROUP_PA) { 4921 + ext4_mb_release_group_pa(&e4b, pa); 4922 + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4923 + } else { 4924 + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); 4925 + ext4_mb_pa_free(pa); 4926 + } 5112 4927 } 5113 4928 5114 4929 ext4_unlock_group(sb, group); ··· 5147 4944 ext4_group_t group = 0; 5148 4945 struct list_head list; 5149 4946 struct ext4_buddy e4b; 4947 + struct rb_node *iter; 5150 4948 int err; 5151 4949 5152 4950 if (!S_ISREG(inode->i_mode)) { 5153 - /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ 5154 4951 return; 5155 4952 } 5156 4953 ··· 5169 4966 5170 4967 repeat: 5171 4968 /* first, collect all pa's in the inode */ 5172 - spin_lock(&ei->i_prealloc_lock); 5173 - while (!list_empty(&ei->i_prealloc_list) && needed) { 5174 - pa = list_entry(ei->i_prealloc_list.prev, 5175 - struct ext4_prealloc_space, pa_inode_list); 5176 - BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); 4969 + write_lock(&ei->i_prealloc_lock); 4970 + for (iter = rb_first(&ei->i_prealloc_node); iter && needed; 4971 + iter = rb_next(iter)) { 4972 + pa 
= rb_entry(iter, struct ext4_prealloc_space, 4973 + pa_node.inode_node); 4974 + BUG_ON(pa->pa_node_lock.inode_lock != &ei->i_prealloc_lock); 4975 + 5177 4976 spin_lock(&pa->pa_lock); 5178 4977 if (atomic_read(&pa->pa_count)) { 5179 4978 /* this shouldn't happen often - nobody should 5180 4979 * use preallocation while we're discarding it */ 5181 4980 spin_unlock(&pa->pa_lock); 5182 - spin_unlock(&ei->i_prealloc_lock); 4981 + write_unlock(&ei->i_prealloc_lock); 5183 4982 ext4_msg(sb, KERN_ERR, 5184 4983 "uh-oh! used pa while discarding"); 5185 4984 WARN_ON(1); ··· 5192 4987 if (pa->pa_deleted == 0) { 5193 4988 ext4_mb_mark_pa_deleted(sb, pa); 5194 4989 spin_unlock(&pa->pa_lock); 5195 - list_del_rcu(&pa->pa_inode_list); 4990 + rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); 5196 4991 list_add(&pa->u.pa_tmp_list, &list); 5197 4992 needed--; 5198 4993 continue; ··· 5200 4995 5201 4996 /* someone is deleting pa right now */ 5202 4997 spin_unlock(&pa->pa_lock); 5203 - spin_unlock(&ei->i_prealloc_lock); 4998 + write_unlock(&ei->i_prealloc_lock); 5204 4999 5205 5000 /* we have to wait here because pa_deleted 5206 5001 * doesn't mean pa is already unlinked from ··· 5217 5012 schedule_timeout_uninterruptible(HZ); 5218 5013 goto repeat; 5219 5014 } 5220 - spin_unlock(&ei->i_prealloc_lock); 5015 + write_unlock(&ei->i_prealloc_lock); 5221 5016 5222 5017 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 5223 5018 BUG_ON(pa->pa_type != MB_INODE_PA); ··· 5249 5044 put_bh(bitmap_bh); 5250 5045 5251 5046 list_del(&pa->u.pa_tmp_list); 5252 - call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 5047 + ext4_mb_pa_free(pa); 5253 5048 } 5254 5049 } 5255 5050 ··· 5266 5061 return 0; 5267 5062 } 5268 5063 5269 - static void ext4_mb_pa_free(struct ext4_allocation_context *ac) 5064 + static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac) 5270 5065 { 5271 5066 struct ext4_prealloc_space *pa = ac->ac_pa; 5272 5067 5273 5068 BUG_ON(!pa); 5274 5069 ac->ac_pa = NULL; 5275 
5070 WARN_ON(!atomic_dec_and_test(&pa->pa_count)); 5276 - kmem_cache_free(ext4_pspace_cachep, pa); 5071 + /* 5072 + * current function is only called due to an error or due to 5073 + * len of found blocks < len of requested blocks hence the PA has not 5074 + * been added to grp->bb_prealloc_list. So we don't need to lock it 5075 + */ 5076 + pa->pa_deleted = 1; 5077 + ext4_mb_pa_free(pa); 5277 5078 } 5278 5079 5279 5080 #ifdef CONFIG_EXT4_DEBUG ··· 5482 5271 5483 5272 spin_lock(&lg->lg_prealloc_lock); 5484 5273 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], 5485 - pa_inode_list, 5274 + pa_node.lg_list, 5486 5275 lockdep_is_held(&lg->lg_prealloc_lock)) { 5487 5276 spin_lock(&pa->pa_lock); 5488 5277 if (atomic_read(&pa->pa_count)) { ··· 5505 5294 ext4_mb_mark_pa_deleted(sb, pa); 5506 5295 spin_unlock(&pa->pa_lock); 5507 5296 5508 - list_del_rcu(&pa->pa_inode_list); 5297 + list_del_rcu(&pa->pa_node.lg_list); 5509 5298 list_add(&pa->u.pa_tmp_list, &discard_list); 5510 5299 5511 5300 total_entries--; ··· 5566 5355 /* Add the prealloc space to lg */ 5567 5356 spin_lock(&lg->lg_prealloc_lock); 5568 5357 list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], 5569 - pa_inode_list, 5358 + pa_node.lg_list, 5570 5359 lockdep_is_held(&lg->lg_prealloc_lock)) { 5571 5360 spin_lock(&tmp_pa->pa_lock); 5572 5361 if (tmp_pa->pa_deleted) { ··· 5575 5364 } 5576 5365 if (!added && pa->pa_free < tmp_pa->pa_free) { 5577 5366 /* Add to the tail of the previous entry */ 5578 - list_add_tail_rcu(&pa->pa_inode_list, 5579 - &tmp_pa->pa_inode_list); 5367 + list_add_tail_rcu(&pa->pa_node.lg_list, 5368 + &tmp_pa->pa_node.lg_list); 5580 5369 added = 1; 5581 5370 /* 5582 5371 * we want to count the total ··· 5587 5376 lg_prealloc_count++; 5588 5377 } 5589 5378 if (!added) 5590 - list_add_tail_rcu(&pa->pa_inode_list, 5379 + list_add_tail_rcu(&pa->pa_node.lg_list, 5591 5380 &lg->lg_prealloc_list[order]); 5592 5381 spin_unlock(&lg->lg_prealloc_lock); 5593 5382 ··· 5601 5390 } 5602 
5391 5603 5392 /* 5604 - * if per-inode prealloc list is too long, trim some PA 5605 - */ 5606 - static void ext4_mb_trim_inode_pa(struct inode *inode) 5607 - { 5608 - struct ext4_inode_info *ei = EXT4_I(inode); 5609 - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5610 - int count, delta; 5611 - 5612 - count = atomic_read(&ei->i_prealloc_active); 5613 - delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1; 5614 - if (count > sbi->s_mb_max_inode_prealloc + delta) { 5615 - count -= sbi->s_mb_max_inode_prealloc; 5616 - ext4_discard_preallocations(inode, count); 5617 - } 5618 - } 5619 - 5620 - /* 5621 5393 * release all resource we used in allocation 5622 5394 */ 5623 5395 static int ext4_mb_release_context(struct ext4_allocation_context *ac) 5624 5396 { 5625 - struct inode *inode = ac->ac_inode; 5626 - struct ext4_inode_info *ei = EXT4_I(inode); 5627 5397 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 5628 5398 struct ext4_prealloc_space *pa = ac->ac_pa; 5629 5399 if (pa) { ··· 5624 5432 * doesn't grow big. 5625 5433 */ 5626 5434 if (likely(pa->pa_free)) { 5627 - spin_lock(pa->pa_obj_lock); 5628 - list_del_rcu(&pa->pa_inode_list); 5629 - spin_unlock(pa->pa_obj_lock); 5435 + spin_lock(pa->pa_node_lock.lg_lock); 5436 + list_del_rcu(&pa->pa_node.lg_list); 5437 + spin_unlock(pa->pa_node_lock.lg_lock); 5630 5438 ext4_mb_add_n_trim(ac); 5631 5439 } 5632 - } 5633 - 5634 - if (pa->pa_type == MB_INODE_PA) { 5635 - /* 5636 - * treat per-inode prealloc list as a lru list, then try 5637 - * to trim the least recently used PA. 
5638 - */ 5639 - spin_lock(pa->pa_obj_lock); 5640 - list_move(&pa->pa_inode_list, &ei->i_prealloc_list); 5641 - spin_unlock(pa->pa_obj_lock); 5642 5440 } 5643 5441 5644 5442 ext4_mb_put_pa(ac, ac->ac_sb, pa); ··· 5640 5458 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) 5641 5459 mutex_unlock(&ac->ac_lg->lg_mutex); 5642 5460 ext4_mb_collect_stats(ac); 5643 - ext4_mb_trim_inode_pa(inode); 5644 5461 return 0; 5645 5462 } 5646 5463 ··· 5792 5611 * So we have to free this pa here itself. 5793 5612 */ 5794 5613 if (*errp) { 5795 - ext4_mb_pa_free(ac); 5614 + ext4_mb_pa_put_free(ac); 5796 5615 ext4_discard_allocated_blocks(ac); 5797 5616 goto errout; 5798 5617 } 5799 5618 if (ac->ac_status == AC_STATUS_FOUND && 5800 5619 ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len) 5801 - ext4_mb_pa_free(ac); 5620 + ext4_mb_pa_put_free(ac); 5802 5621 } 5803 5622 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 5804 5623 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); ··· 5817 5636 * If block allocation fails then the pa allocated above 5818 5637 * needs to be freed here itself. 
5819 5638 */ 5820 - ext4_mb_pa_free(ac); 5639 + ext4_mb_pa_put_free(ac); 5821 5640 *errp = -ENOSPC; 5822 5641 } 5823 5642 5824 - errout: 5825 5643 if (*errp) { 5644 + errout: 5826 5645 ac->ac_b_ex.fe_len = 0; 5827 5646 ar->len = 0; 5828 5647 ext4_mb_show_ac(ac); 5829 5648 } 5830 5649 ext4_mb_release_context(ac); 5650 + kmem_cache_free(ext4_ac_cachep, ac); 5831 5651 out: 5832 - if (ac) 5833 - kmem_cache_free(ext4_ac_cachep, ac); 5834 5652 if (inquota && ar->len < inquota) 5835 5653 dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); 5836 5654 if (!ar->len) { ··· 5873 5693 kmem_cache_free(ext4_free_data_cachep, entry); 5874 5694 } 5875 5695 5876 - static noinline_for_stack int 5696 + static noinline_for_stack void 5877 5697 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, 5878 5698 struct ext4_free_data *new_entry) 5879 5699 { ··· 5916 5736 EXT4_C2B(sbi, cluster), 5917 5737 "Block already on to-be-freed list"); 5918 5738 kmem_cache_free(ext4_free_data_cachep, new_entry); 5919 - return 0; 5739 + return; 5920 5740 } 5921 5741 } 5922 5742 ··· 5942 5762 list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list); 5943 5763 sbi->s_mb_free_pending += clusters; 5944 5764 spin_unlock(&sbi->s_md_lock); 5945 - return 0; 5946 5765 } 5947 5766 5948 5767 /* ··· 5976 5797 return 0; 5977 5798 } 5978 5799 5979 - ext4_get_group_no_and_offset(sb, 5980 - max(ext4_group_first_block_no(sb, group), goal), 5981 - NULL, &blkoff); 5982 5800 while (1) { 5983 5801 i = mb_find_next_zero_bit(bitmap_bh->b_data, max, 5984 5802 blkoff); ··· 5990 5814 brelse(bitmap_bh); 5991 5815 if (i < max) 5992 5816 break; 5817 + 5818 + blkoff = 0; 5993 5819 } 5994 5820 5995 5821 if (group >= ext4_get_groups_count(sb) || i >= max) { ··· 6020 5842 ext4_get_group_no_and_offset(sb, block, &group, &blkoff); 6021 5843 bitmap_bh = ext4_read_block_bitmap(sb, group); 6022 5844 if (IS_ERR(bitmap_bh)) { 6023 - err = PTR_ERR(bitmap_bh); 6024 5845 pr_warn("Failed to read block bitmap\n"); 6025 5846 
return; 6026 5847 } 6027 5848 gdp = ext4_get_group_desc(sb, group, &gdp_bh); 6028 5849 if (!gdp) 6029 - return; 5850 + goto err_out; 6030 5851 6031 5852 for (i = 0; i < count; i++) { 6032 5853 if (!mb_test_bit(blkoff + i, bitmap_bh->b_data)) ··· 6034 5857 mb_clear_bits(bitmap_bh->b_data, blkoff, count); 6035 5858 err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); 6036 5859 if (err) 6037 - return; 5860 + goto err_out; 6038 5861 ext4_free_group_clusters_set( 6039 5862 sb, gdp, ext4_free_group_clusters(sb, gdp) + 6040 5863 count - already_freed); 6041 - ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh); 5864 + ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); 6042 5865 ext4_group_desc_csum_set(sb, group, gdp); 6043 5866 ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); 6044 5867 sync_dirty_buffer(bitmap_bh); 6045 5868 sync_dirty_buffer(gdp_bh); 5869 + 5870 + err_out: 6046 5871 brelse(bitmap_bh); 6047 5872 } 6048 5873 ··· 6202 6023 6203 6024 ret = ext4_free_group_clusters(sb, gdp) + count_clusters; 6204 6025 ext4_free_group_clusters_set(sb, gdp, ret); 6205 - ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh); 6026 + ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); 6206 6027 ext4_group_desc_csum_set(sb, block_group, gdp); 6207 6028 ext4_unlock_group(sb, block_group); 6208 6029 ··· 6459 6280 free_clusters_count = clusters_freed + 6460 6281 ext4_free_group_clusters(sb, desc); 6461 6282 ext4_free_group_clusters_set(sb, desc, free_clusters_count); 6462 - ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh); 6283 + ext4_block_bitmap_csum_set(sb, desc, bitmap_bh); 6463 6284 ext4_group_desc_csum_set(sb, block_group, desc); 6464 6285 ext4_unlock_group(sb, block_group); 6465 6286 percpu_counter_add(&sbi->s_freeclusters_counter,
+9 -8
fs/ext4/mballoc.h
··· 74 74 #define MB_DEFAULT_GROUP_PREALLOC 512 75 75 76 76 /* 77 - * maximum length of inode prealloc list 78 - */ 79 - #define MB_DEFAULT_MAX_INODE_PREALLOC 512 80 - 81 - /* 82 77 * Number of groups to search linearly before performing group scanning 83 78 * optimization. 84 79 */ ··· 109 114 }; 110 115 111 116 struct ext4_prealloc_space { 112 - struct list_head pa_inode_list; 117 + union { 118 + struct rb_node inode_node; /* for inode PA rbtree */ 119 + struct list_head lg_list; /* for lg PAs */ 120 + } pa_node; 113 121 struct list_head pa_group_list; 114 122 union { 115 123 struct list_head pa_tmp_list; ··· 126 128 ext4_grpblk_t pa_len; /* len of preallocated chunk */ 127 129 ext4_grpblk_t pa_free; /* how many blocks are free */ 128 130 unsigned short pa_type; /* pa type. inode or group */ 129 - spinlock_t *pa_obj_lock; 130 - struct inode *pa_inode; /* hack, for history only */ 131 + union { 132 + rwlock_t *inode_lock; /* locks the rbtree holding this PA */ 133 + spinlock_t *lg_lock; /* locks the lg list holding this PA */ 134 + } pa_node_lock; 135 + struct inode *pa_inode; /* used to get the inode during group discard */ 131 136 }; 132 137 133 138 enum {
+17 -16
fs/ext4/move_extent.c
··· 126 126 { 127 127 struct address_space *mapping[2]; 128 128 unsigned int flags; 129 - unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; 130 129 131 130 BUG_ON(!inode1 || !inode2); 132 131 if (inode1 < inode2) { ··· 138 139 } 139 140 140 141 flags = memalloc_nofs_save(); 141 - folio[0] = __filemap_get_folio(mapping[0], index1, fgp_flags, 142 + folio[0] = __filemap_get_folio(mapping[0], index1, FGP_WRITEBEGIN, 142 143 mapping_gfp_mask(mapping[0])); 143 144 if (!folio[0]) { 144 145 memalloc_nofs_restore(flags); 145 146 return -ENOMEM; 146 147 } 147 148 148 - folio[1] = __filemap_get_folio(mapping[1], index2, fgp_flags, 149 + folio[1] = __filemap_get_folio(mapping[1], index2, FGP_WRITEBEGIN, 149 150 mapping_gfp_mask(mapping[1])); 150 151 memalloc_nofs_restore(flags); 151 152 if (!folio[1]) { ··· 168 169 169 170 /* Force page buffers uptodate w/o dropping page's lock */ 170 171 static int 171 - mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) 172 + mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to) 172 173 { 173 - struct inode *inode = page->mapping->host; 174 + struct inode *inode = folio->mapping->host; 174 175 sector_t block; 175 176 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; 176 177 unsigned int blocksize, block_start, block_end; 177 178 int i, err, nr = 0, partial = 0; 178 - BUG_ON(!PageLocked(page)); 179 - BUG_ON(PageWriteback(page)); 179 + BUG_ON(!folio_test_locked(folio)); 180 + BUG_ON(folio_test_writeback(folio)); 180 181 181 - if (PageUptodate(page)) 182 + if (folio_test_uptodate(folio)) 182 183 return 0; 183 184 184 185 blocksize = i_blocksize(inode); 185 - if (!page_has_buffers(page)) 186 - create_empty_buffers(page, blocksize, 0); 186 + head = folio_buffers(folio); 187 + if (!head) { 188 + create_empty_buffers(&folio->page, blocksize, 0); 189 + head = folio_buffers(folio); 190 + } 187 191 188 - head = page_buffers(page); 189 - block = (sector_t)page->index << (PAGE_SHIFT - 
inode->i_blkbits); 192 + block = (sector_t)folio->index << (PAGE_SHIFT - inode->i_blkbits); 190 193 for (bh = head, block_start = 0; bh != head || !block_start; 191 194 block++, block_start = block_end, bh = bh->b_this_page) { 192 195 block_end = block_start + blocksize; ··· 202 201 if (!buffer_mapped(bh)) { 203 202 err = ext4_get_block(inode, block, bh, 0); 204 203 if (err) { 205 - SetPageError(page); 204 + folio_set_error(folio); 206 205 return err; 207 206 } 208 207 if (!buffer_mapped(bh)) { 209 - zero_user(page, block_start, blocksize); 208 + folio_zero_range(folio, block_start, blocksize); 210 209 set_buffer_uptodate(bh); 211 210 continue; 212 211 } ··· 228 227 } 229 228 out: 230 229 if (!partial) 231 - SetPageUptodate(page); 230 + folio_mark_uptodate(folio); 232 231 return 0; 233 232 } 234 233 ··· 356 355 goto unlock_folios; 357 356 } 358 357 data_copy: 359 - *err = mext_page_mkuptodate(&folio[0]->page, from, from + replaced_size); 358 + *err = mext_page_mkuptodate(folio[0], from, from + replaced_size); 360 359 if (*err) 361 360 goto unlock_folios; 362 361
+56 -60
fs/ext4/page-io.c
··· 99 99 100 100 static void ext4_finish_bio(struct bio *bio) 101 101 { 102 - struct bio_vec *bvec; 103 - struct bvec_iter_all iter_all; 102 + struct folio_iter fi; 104 103 105 - bio_for_each_segment_all(bvec, bio, iter_all) { 106 - struct page *page = bvec->bv_page; 107 - struct page *bounce_page = NULL; 104 + bio_for_each_folio_all(fi, bio) { 105 + struct folio *folio = fi.folio; 106 + struct folio *io_folio = NULL; 108 107 struct buffer_head *bh, *head; 109 - unsigned bio_start = bvec->bv_offset; 110 - unsigned bio_end = bio_start + bvec->bv_len; 108 + size_t bio_start = fi.offset; 109 + size_t bio_end = bio_start + fi.length; 111 110 unsigned under_io = 0; 112 111 unsigned long flags; 113 112 114 - if (fscrypt_is_bounce_page(page)) { 115 - bounce_page = page; 116 - page = fscrypt_pagecache_page(bounce_page); 113 + if (fscrypt_is_bounce_folio(folio)) { 114 + io_folio = folio; 115 + folio = fscrypt_pagecache_folio(folio); 117 116 } 118 117 119 118 if (bio->bi_status) { 120 - SetPageError(page); 121 - mapping_set_error(page->mapping, -EIO); 119 + int err = blk_status_to_errno(bio->bi_status); 120 + folio_set_error(folio); 121 + mapping_set_error(folio->mapping, err); 122 122 } 123 - bh = head = page_buffers(page); 123 + bh = head = folio_buffers(folio); 124 124 /* 125 - * We check all buffers in the page under b_uptodate_lock 125 + * We check all buffers in the folio under b_uptodate_lock 126 126 * to avoid races with other end io clearing async_write flags 127 127 */ 128 128 spin_lock_irqsave(&head->b_uptodate_lock, flags); ··· 141 141 } while ((bh = bh->b_this_page) != head); 142 142 spin_unlock_irqrestore(&head->b_uptodate_lock, flags); 143 143 if (!under_io) { 144 - fscrypt_free_bounce_page(bounce_page); 145 - end_page_writeback(page); 144 + fscrypt_free_bounce_page(&io_folio->page); 145 + folio_end_writeback(folio); 146 146 } 147 147 } 148 148 } ··· 409 409 410 410 static void io_submit_add_bh(struct ext4_io_submit *io, 411 411 struct inode *inode, 412 - 
struct page *pagecache_page, 413 - struct page *bounce_page, 412 + struct folio *folio, 413 + struct folio *io_folio, 414 414 struct buffer_head *bh) 415 415 { 416 - int ret; 417 - 418 416 if (io->io_bio && (bh->b_blocknr != io->io_next_block || 419 417 !fscrypt_mergeable_bio_bh(io->io_bio, bh))) { 420 418 submit_and_retry: ··· 420 422 } 421 423 if (io->io_bio == NULL) 422 424 io_submit_init_bio(io, bh); 423 - ret = bio_add_page(io->io_bio, bounce_page ?: pagecache_page, 424 - bh->b_size, bh_offset(bh)); 425 - if (ret != bh->b_size) 425 + if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh))) 426 426 goto submit_and_retry; 427 - wbc_account_cgroup_owner(io->io_wbc, pagecache_page, bh->b_size); 427 + wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size); 428 428 io->io_next_block++; 429 429 } 430 430 431 - int ext4_bio_write_page(struct ext4_io_submit *io, 432 - struct page *page, 433 - int len) 431 + int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, 432 + size_t len) 434 433 { 435 - struct page *bounce_page = NULL; 436 - struct inode *inode = page->mapping->host; 434 + struct folio *io_folio = folio; 435 + struct inode *inode = folio->mapping->host; 437 436 unsigned block_start; 438 437 struct buffer_head *bh, *head; 439 438 int ret = 0; ··· 438 443 struct writeback_control *wbc = io->io_wbc; 439 444 bool keep_towrite = false; 440 445 441 - BUG_ON(!PageLocked(page)); 442 - BUG_ON(PageWriteback(page)); 446 + BUG_ON(!folio_test_locked(folio)); 447 + BUG_ON(folio_test_writeback(folio)); 443 448 444 - ClearPageError(page); 449 + folio_clear_error(folio); 445 450 446 451 /* 447 452 * Comments copied from block_write_full_page: 448 453 * 449 - * The page straddles i_size. It must be zeroed out on each and every 454 + * The folio straddles i_size. It must be zeroed out on each and every 450 455 * writepage invocation because it may be mmapped. "A file is mapped 451 456 * in multiples of the page size. 
For a file that is not a multiple of 452 457 * the page size, the remaining memory is zeroed when mapped, and 453 458 * writes to that region are not written out to the file." 454 459 */ 455 - if (len < PAGE_SIZE) 456 - zero_user_segment(page, len, PAGE_SIZE); 460 + if (len < folio_size(folio)) 461 + folio_zero_segment(folio, len, folio_size(folio)); 457 462 /* 458 463 * In the first loop we prepare and mark buffers to submit. We have to 459 - * mark all buffers in the page before submitting so that 460 - * end_page_writeback() cannot be called from ext4_end_bio() when IO 464 + * mark all buffers in the folio before submitting so that 465 + * folio_end_writeback() cannot be called from ext4_end_bio() when IO 461 466 * on the first buffer finishes and we are still working on submitting 462 467 * the second buffer. 463 468 */ 464 - bh = head = page_buffers(page); 469 + bh = head = folio_buffers(folio); 465 470 do { 466 471 block_start = bh_offset(bh); 467 472 if (block_start >= len) { ··· 476 481 clear_buffer_dirty(bh); 477 482 /* 478 483 * Keeping dirty some buffer we cannot write? Make sure 479 - * to redirty the page and keep TOWRITE tag so that 480 - * racing WB_SYNC_ALL writeback does not skip the page. 484 + * to redirty the folio and keep TOWRITE tag so that 485 + * racing WB_SYNC_ALL writeback does not skip the folio. 481 486 * This happens e.g. when doing writeout for 482 - * transaction commit. 487 + * transaction commit or when journalled data is not 488 + * yet committed. 483 489 */ 484 - if (buffer_dirty(bh)) { 485 - if (!PageDirty(page)) 486 - redirty_page_for_writepage(wbc, page); 490 + if (buffer_dirty(bh) || 491 + (buffer_jbd(bh) && buffer_jbddirty(bh))) { 492 + if (!folio_test_dirty(folio)) 493 + folio_redirty_for_writepage(wbc, folio); 487 494 keep_towrite = true; 488 495 } 489 496 continue; ··· 497 500 nr_to_submit++; 498 501 } while ((bh = bh->b_this_page) != head); 499 502 500 - /* Nothing to submit? Just unlock the page... 
*/ 503 + /* Nothing to submit? Just unlock the folio... */ 501 504 if (!nr_to_submit) 502 - goto unlock; 505 + return 0; 503 506 504 - bh = head = page_buffers(page); 507 + bh = head = folio_buffers(folio); 505 508 506 509 /* 507 510 * If any blocks are being written to an encrypted file, encrypt them ··· 510 513 * (e.g. holes) to be unnecessarily encrypted, but this is rare and 511 514 * can't happen in the common case of blocksize == PAGE_SIZE. 512 515 */ 513 - if (fscrypt_inode_uses_fs_layer_crypto(inode) && nr_to_submit) { 516 + if (fscrypt_inode_uses_fs_layer_crypto(inode)) { 514 517 gfp_t gfp_flags = GFP_NOFS; 515 518 unsigned int enc_bytes = round_up(len, i_blocksize(inode)); 519 + struct page *bounce_page; 516 520 517 521 /* 518 522 * Since bounce page allocation uses a mempool, we can only use ··· 523 525 if (io->io_bio) 524 526 gfp_flags = GFP_NOWAIT | __GFP_NOWARN; 525 527 retry_encrypt: 526 - bounce_page = fscrypt_encrypt_pagecache_blocks(page, enc_bytes, 527 - 0, gfp_flags); 528 + bounce_page = fscrypt_encrypt_pagecache_blocks(&folio->page, 529 + enc_bytes, 0, gfp_flags); 528 530 if (IS_ERR(bounce_page)) { 529 531 ret = PTR_ERR(bounce_page); 530 532 if (ret == -ENOMEM && ··· 540 542 } 541 543 542 544 printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); 543 - redirty_page_for_writepage(wbc, page); 545 + folio_redirty_for_writepage(wbc, folio); 544 546 do { 545 547 if (buffer_async_write(bh)) { 546 548 clear_buffer_async_write(bh); ··· 548 550 } 549 551 bh = bh->b_this_page; 550 552 } while (bh != head); 551 - goto unlock; 553 + 554 + return ret; 552 555 } 556 + io_folio = page_folio(bounce_page); 553 557 } 554 558 555 - if (keep_towrite) 556 - set_page_writeback_keepwrite(page); 557 - else 558 - set_page_writeback(page); 559 + __folio_start_writeback(folio, keep_towrite); 559 560 560 561 /* Now submit buffers to write */ 561 562 do { 562 563 if (!buffer_async_write(bh)) 563 564 continue; 564 - io_submit_add_bh(io, inode, page, bounce_page, bh); 
565 + io_submit_add_bh(io, inode, folio, io_folio, bh); 565 566 } while ((bh = bh->b_this_page) != head); 566 - unlock: 567 - unlock_page(page); 568 - return ret; 567 + 568 + return 0; 569 569 }
+34 -38
fs/ext4/readpage.c
··· 68 68 69 69 static void __read_end_io(struct bio *bio) 70 70 { 71 - struct page *page; 72 - struct bio_vec *bv; 73 - struct bvec_iter_all iter_all; 71 + struct folio_iter fi; 74 72 75 - bio_for_each_segment_all(bv, bio, iter_all) { 76 - page = bv->bv_page; 73 + bio_for_each_folio_all(fi, bio) { 74 + struct folio *folio = fi.folio; 77 75 78 76 if (bio->bi_status) 79 - ClearPageUptodate(page); 77 + folio_clear_uptodate(folio); 80 78 else 81 - SetPageUptodate(page); 82 - unlock_page(page); 79 + folio_mark_uptodate(folio); 80 + folio_unlock(folio); 83 81 } 84 82 if (bio->bi_private) 85 83 mempool_free(bio->bi_private, bio_post_read_ctx_pool); ··· 216 218 } 217 219 218 220 int ext4_mpage_readpages(struct inode *inode, 219 - struct readahead_control *rac, struct page *page) 221 + struct readahead_control *rac, struct folio *folio) 220 222 { 221 223 struct bio *bio = NULL; 222 224 sector_t last_block_in_bio = 0; ··· 245 247 int fully_mapped = 1; 246 248 unsigned first_hole = blocks_per_page; 247 249 248 - if (rac) { 249 - page = readahead_page(rac); 250 - prefetchw(&page->flags); 251 - } 250 + if (rac) 251 + folio = readahead_folio(rac); 252 + prefetchw(&folio->flags); 252 253 253 - if (page_has_buffers(page)) 254 + if (folio_buffers(folio)) 254 255 goto confused; 255 256 256 257 block_in_file = next_block = 257 - (sector_t)page->index << (PAGE_SHIFT - blkbits); 258 + (sector_t)folio->index << (PAGE_SHIFT - blkbits); 258 259 last_block = block_in_file + nr_pages * blocks_per_page; 259 260 last_block_in_file = (ext4_readpage_limit(inode) + 260 261 blocksize - 1) >> blkbits; ··· 287 290 288 291 /* 289 292 * Then do more ext4_map_blocks() calls until we are 290 - * done with this page. 293 + * done with this folio. 
291 294 */ 292 295 while (page_block < blocks_per_page) { 293 296 if (block_in_file < last_block) { ··· 296 299 297 300 if (ext4_map_blocks(NULL, inode, &map, 0) < 0) { 298 301 set_error_page: 299 - SetPageError(page); 300 - zero_user_segment(page, 0, 301 - PAGE_SIZE); 302 - unlock_page(page); 302 + folio_set_error(folio); 303 + folio_zero_segment(folio, 0, 304 + folio_size(folio)); 305 + folio_unlock(folio); 303 306 goto next_page; 304 307 } 305 308 } ··· 330 333 } 331 334 } 332 335 if (first_hole != blocks_per_page) { 333 - zero_user_segment(page, first_hole << blkbits, 334 - PAGE_SIZE); 336 + folio_zero_segment(folio, first_hole << blkbits, 337 + folio_size(folio)); 335 338 if (first_hole == 0) { 336 - if (ext4_need_verity(inode, page->index) && 337 - !fsverity_verify_page(page)) 339 + if (ext4_need_verity(inode, folio->index) && 340 + !fsverity_verify_page(&folio->page)) 338 341 goto set_error_page; 339 - SetPageUptodate(page); 340 - unlock_page(page); 341 - goto next_page; 342 + folio_mark_uptodate(folio); 343 + folio_unlock(folio); 344 + continue; 342 345 } 343 346 } else if (fully_mapped) { 344 - SetPageMappedToDisk(page); 347 + folio_set_mappedtodisk(folio); 345 348 } 346 349 347 350 /* 348 - * This page will go to BIO. Do we need to send this 351 + * This folio will go to BIO. Do we need to send this 349 352 * BIO off first? 
350 353 */ 351 354 if (bio && (last_block_in_bio != blocks[0] - 1 || ··· 363 366 REQ_OP_READ, GFP_KERNEL); 364 367 fscrypt_set_bio_crypt_ctx(bio, inode, next_block, 365 368 GFP_KERNEL); 366 - ext4_set_bio_post_read_ctx(bio, inode, page->index); 369 + ext4_set_bio_post_read_ctx(bio, inode, folio->index); 367 370 bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); 368 371 bio->bi_end_io = mpage_end_io; 369 372 if (rac) ··· 371 374 } 372 375 373 376 length = first_hole << blkbits; 374 - if (bio_add_page(bio, page, length, 0) < length) 377 + if (!bio_add_folio(bio, folio, length, 0)) 375 378 goto submit_and_realloc; 376 379 377 380 if (((map.m_flags & EXT4_MAP_BOUNDARY) && ··· 381 384 bio = NULL; 382 385 } else 383 386 last_block_in_bio = blocks[blocks_per_page - 1]; 384 - goto next_page; 387 + continue; 385 388 confused: 386 389 if (bio) { 387 390 submit_bio(bio); 388 391 bio = NULL; 389 392 } 390 - if (!PageUptodate(page)) 391 - block_read_full_folio(page_folio(page), ext4_get_block); 393 + if (!folio_test_uptodate(folio)) 394 + block_read_full_folio(folio, ext4_get_block); 392 395 else 393 - unlock_page(page); 394 - next_page: 395 - if (rac) 396 - put_page(page); 396 + folio_unlock(folio); 397 + next_page: 398 + ; /* A label shall be followed by a statement until C23 */ 397 399 } 398 400 if (bio) 399 401 submit_bio(bio);
+3 -4
fs/ext4/resize.c
··· 1306 1306 } 1307 1307 1308 1308 static int ext4_set_bitmap_checksums(struct super_block *sb, 1309 - ext4_group_t group, 1310 1309 struct ext4_group_desc *gdp, 1311 1310 struct ext4_new_group_data *group_data) 1312 1311 { ··· 1317 1318 bh = ext4_get_bitmap(sb, group_data->inode_bitmap); 1318 1319 if (!bh) 1319 1320 return -EIO; 1320 - ext4_inode_bitmap_csum_set(sb, group, gdp, bh, 1321 + ext4_inode_bitmap_csum_set(sb, gdp, bh, 1321 1322 EXT4_INODES_PER_GROUP(sb) / 8); 1322 1323 brelse(bh); 1323 1324 1324 1325 bh = ext4_get_bitmap(sb, group_data->block_bitmap); 1325 1326 if (!bh) 1326 1327 return -EIO; 1327 - ext4_block_bitmap_csum_set(sb, group, gdp, bh); 1328 + ext4_block_bitmap_csum_set(sb, gdp, bh); 1328 1329 brelse(bh); 1329 1330 1330 1331 return 0; ··· 1362 1363 memset(gdp, 0, EXT4_DESC_SIZE(sb)); 1363 1364 ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap); 1364 1365 ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap); 1365 - err = ext4_set_bitmap_checksums(sb, group, gdp, group_data); 1366 + err = ext4_set_bitmap_checksums(sb, gdp, group_data); 1366 1367 if (err) { 1367 1368 ext4_std_error(sb, err); 1368 1369 break;
+211 -202
fs/ext4/super.c
··· 1183 1183 } 1184 1184 #endif 1185 1185 1186 + static int ext4_percpu_param_init(struct ext4_sb_info *sbi) 1187 + { 1188 + ext4_fsblk_t block; 1189 + int err; 1190 + 1191 + block = ext4_count_free_clusters(sbi->s_sb); 1192 + ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); 1193 + err = percpu_counter_init(&sbi->s_freeclusters_counter, block, 1194 + GFP_KERNEL); 1195 + if (!err) { 1196 + unsigned long freei = ext4_count_free_inodes(sbi->s_sb); 1197 + sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); 1198 + err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, 1199 + GFP_KERNEL); 1200 + } 1201 + if (!err) 1202 + err = percpu_counter_init(&sbi->s_dirs_counter, 1203 + ext4_count_dirs(sbi->s_sb), GFP_KERNEL); 1204 + if (!err) 1205 + err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, 1206 + GFP_KERNEL); 1207 + if (!err) 1208 + err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0, 1209 + GFP_KERNEL); 1210 + if (!err) 1211 + err = percpu_init_rwsem(&sbi->s_writepages_rwsem); 1212 + 1213 + if (err) 1214 + ext4_msg(sbi->s_sb, KERN_ERR, "insufficient memory"); 1215 + 1216 + return err; 1217 + } 1218 + 1219 + static void ext4_percpu_param_destroy(struct ext4_sb_info *sbi) 1220 + { 1221 + percpu_counter_destroy(&sbi->s_freeclusters_counter); 1222 + percpu_counter_destroy(&sbi->s_freeinodes_counter); 1223 + percpu_counter_destroy(&sbi->s_dirs_counter); 1224 + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); 1225 + percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); 1226 + percpu_free_rwsem(&sbi->s_writepages_rwsem); 1227 + } 1228 + 1229 + static void ext4_group_desc_free(struct ext4_sb_info *sbi) 1230 + { 1231 + struct buffer_head **group_desc; 1232 + int i; 1233 + 1234 + rcu_read_lock(); 1235 + group_desc = rcu_dereference(sbi->s_group_desc); 1236 + for (i = 0; i < sbi->s_gdb_count; i++) 1237 + brelse(group_desc[i]); 1238 + kvfree(group_desc); 1239 + rcu_read_unlock(); 1240 + } 1241 + 1242 + static void 
ext4_flex_groups_free(struct ext4_sb_info *sbi) 1243 + { 1244 + struct flex_groups **flex_groups; 1245 + int i; 1246 + 1247 + rcu_read_lock(); 1248 + flex_groups = rcu_dereference(sbi->s_flex_groups); 1249 + if (flex_groups) { 1250 + for (i = 0; i < sbi->s_flex_groups_allocated; i++) 1251 + kvfree(flex_groups[i]); 1252 + kvfree(flex_groups); 1253 + } 1254 + rcu_read_unlock(); 1255 + } 1256 + 1186 1257 static void ext4_put_super(struct super_block *sb) 1187 1258 { 1188 1259 struct ext4_sb_info *sbi = EXT4_SB(sb); 1189 1260 struct ext4_super_block *es = sbi->s_es; 1190 - struct buffer_head **group_desc; 1191 - struct flex_groups **flex_groups; 1192 1261 int aborted = 0; 1193 1262 int i, err; 1194 1263 ··· 1307 1238 if (!sb_rdonly(sb)) 1308 1239 ext4_commit_super(sb); 1309 1240 1310 - rcu_read_lock(); 1311 - group_desc = rcu_dereference(sbi->s_group_desc); 1312 - for (i = 0; i < sbi->s_gdb_count; i++) 1313 - brelse(group_desc[i]); 1314 - kvfree(group_desc); 1315 - flex_groups = rcu_dereference(sbi->s_flex_groups); 1316 - if (flex_groups) { 1317 - for (i = 0; i < sbi->s_flex_groups_allocated; i++) 1318 - kvfree(flex_groups[i]); 1319 - kvfree(flex_groups); 1320 - } 1321 - rcu_read_unlock(); 1322 - percpu_counter_destroy(&sbi->s_freeclusters_counter); 1323 - percpu_counter_destroy(&sbi->s_freeinodes_counter); 1324 - percpu_counter_destroy(&sbi->s_dirs_counter); 1325 - percpu_counter_destroy(&sbi->s_dirtyclusters_counter); 1326 - percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); 1327 - percpu_free_rwsem(&sbi->s_writepages_rwsem); 1241 + ext4_group_desc_free(sbi); 1242 + ext4_flex_groups_free(sbi); 1243 + ext4_percpu_param_destroy(sbi); 1328 1244 #ifdef CONFIG_QUOTA 1329 1245 for (i = 0; i < EXT4_MAXQUOTAS; i++) 1330 1246 kfree(get_qf_name(sb, sbi, i)); ··· 1379 1325 inode_set_iversion(&ei->vfs_inode, 1); 1380 1326 ei->i_flags = 0; 1381 1327 spin_lock_init(&ei->i_raw_lock); 1382 - INIT_LIST_HEAD(&ei->i_prealloc_list); 1328 + ei->i_prealloc_node = RB_ROOT; 1383 1329 
atomic_set(&ei->i_prealloc_active, 0); 1384 - spin_lock_init(&ei->i_prealloc_lock); 1330 + rwlock_init(&ei->i_prealloc_lock); 1385 1331 ext4_es_init_tree(&ei->i_es_tree); 1386 1332 rwlock_init(&ei->i_es_lock); 1387 1333 INIT_LIST_HEAD(&ei->i_es_list); ··· 4641 4587 struct ext4_super_block *es, 4642 4588 int silent) 4643 4589 { 4590 + struct ext4_sb_info *sbi = EXT4_SB(sb); 4591 + 4644 4592 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && 4645 4593 (ext4_has_compat_features(sb) || 4646 4594 ext4_has_ro_compat_features(sb) || ··· 4712 4656 if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) 4713 4657 return -EINVAL; 4714 4658 4659 + if (sbi->s_daxdev) { 4660 + if (sb->s_blocksize == PAGE_SIZE) 4661 + set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); 4662 + else 4663 + ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); 4664 + } 4665 + 4666 + if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { 4667 + if (ext4_has_feature_inline_data(sb)) { 4668 + ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" 4669 + " that may contain inline data"); 4670 + return -EINVAL; 4671 + } 4672 + if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) { 4673 + ext4_msg(sb, KERN_ERR, 4674 + "DAX unsupported by block device."); 4675 + return -EINVAL; 4676 + } 4677 + } 4678 + 4679 + if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { 4680 + ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", 4681 + es->s_encryption_level); 4682 + return -EINVAL; 4683 + } 4684 + 4715 4685 return 0; 4716 4686 } 4717 4687 4718 - static int ext4_geometry_check(struct super_block *sb, 4688 + static int ext4_check_geometry(struct super_block *sb, 4719 4689 struct ext4_super_block *es) 4720 4690 { 4721 4691 struct ext4_sb_info *sbi = EXT4_SB(sb); 4722 4692 __u64 blocks_count; 4693 + int err; 4694 + 4695 + if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) { 4696 + ext4_msg(sb, KERN_ERR, 4697 + "Number of reserved GDT blocks insanely large: %d", 4698 + 
le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); 4699 + return -EINVAL; 4700 + } 4701 + /* 4702 + * Test whether we have more sectors than will fit in sector_t, 4703 + * and whether the max offset is addressable by the page cache. 4704 + */ 4705 + err = generic_check_addressable(sb->s_blocksize_bits, 4706 + ext4_blocks_count(es)); 4707 + if (err) { 4708 + ext4_msg(sb, KERN_ERR, "filesystem" 4709 + " too large to mount safely on this system"); 4710 + return err; 4711 + } 4723 4712 4724 4713 /* check blocks count against device size */ 4725 4714 blocks_count = sb_bdev_nr_blocks(sb); ··· 4818 4717 } 4819 4718 4820 4719 return 0; 4821 - } 4822 - 4823 - static void ext4_group_desc_free(struct ext4_sb_info *sbi) 4824 - { 4825 - struct buffer_head **group_desc; 4826 - int i; 4827 - 4828 - rcu_read_lock(); 4829 - group_desc = rcu_dereference(sbi->s_group_desc); 4830 - for (i = 0; i < sbi->s_gdb_count; i++) 4831 - brelse(group_desc[i]); 4832 - kvfree(group_desc); 4833 - rcu_read_unlock(); 4834 4720 } 4835 4721 4836 4722 static int ext4_group_desc_init(struct super_block *sb, ··· 4969 4881 return -EINVAL; 4970 4882 } 4971 4883 4972 - static int ext4_journal_data_mode_check(struct super_block *sb) 4884 + static int ext4_check_journal_data_mode(struct super_block *sb) 4973 4885 { 4974 4886 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 4975 4887 printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with " ··· 5112 5024 return ret; 5113 5025 } 5114 5026 5027 + static void ext4_hash_info_init(struct super_block *sb) 5028 + { 5029 + struct ext4_sb_info *sbi = EXT4_SB(sb); 5030 + struct ext4_super_block *es = sbi->s_es; 5031 + unsigned int i; 5032 + 5033 + for (i = 0; i < 4; i++) 5034 + sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); 5035 + 5036 + sbi->s_def_hash_version = es->s_def_hash_version; 5037 + if (ext4_has_feature_dir_index(sb)) { 5038 + i = le32_to_cpu(es->s_flags); 5039 + if (i & EXT2_FLAGS_UNSIGNED_HASH) 5040 + sbi->s_hash_unsigned = 3; 5041 + else if 
((i & EXT2_FLAGS_SIGNED_HASH) == 0) { 5042 + #ifdef __CHAR_UNSIGNED__ 5043 + if (!sb_rdonly(sb)) 5044 + es->s_flags |= 5045 + cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); 5046 + sbi->s_hash_unsigned = 3; 5047 + #else 5048 + if (!sb_rdonly(sb)) 5049 + es->s_flags |= 5050 + cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); 5051 + #endif 5052 + } 5053 + } 5054 + } 5055 + 5056 + static int ext4_block_group_meta_init(struct super_block *sb, int silent) 5057 + { 5058 + struct ext4_sb_info *sbi = EXT4_SB(sb); 5059 + struct ext4_super_block *es = sbi->s_es; 5060 + int has_huge_files; 5061 + 5062 + has_huge_files = ext4_has_feature_huge_file(sb); 5063 + sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, 5064 + has_huge_files); 5065 + sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); 5066 + 5067 + sbi->s_desc_size = le16_to_cpu(es->s_desc_size); 5068 + if (ext4_has_feature_64bit(sb)) { 5069 + if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || 5070 + sbi->s_desc_size > EXT4_MAX_DESC_SIZE || 5071 + !is_power_of_2(sbi->s_desc_size)) { 5072 + ext4_msg(sb, KERN_ERR, 5073 + "unsupported descriptor size %lu", 5074 + sbi->s_desc_size); 5075 + return -EINVAL; 5076 + } 5077 + } else 5078 + sbi->s_desc_size = EXT4_MIN_DESC_SIZE; 5079 + 5080 + sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); 5081 + sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); 5082 + 5083 + sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb); 5084 + if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) { 5085 + if (!silent) 5086 + ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); 5087 + return -EINVAL; 5088 + } 5089 + if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || 5090 + sbi->s_inodes_per_group > sb->s_blocksize * 8) { 5091 + ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", 5092 + sbi->s_inodes_per_group); 5093 + return -EINVAL; 5094 + } 5095 + sbi->s_itb_per_group = sbi->s_inodes_per_group / 5096 + 
sbi->s_inodes_per_block; 5097 + sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb); 5098 + sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY; 5099 + sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); 5100 + sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); 5101 + 5102 + return 0; 5103 + } 5104 + 5115 5105 static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) 5116 5106 { 5117 5107 struct ext4_super_block *es = NULL; 5118 5108 struct ext4_sb_info *sbi = EXT4_SB(sb); 5119 - struct flex_groups **flex_groups; 5120 - ext4_fsblk_t block; 5121 5109 ext4_fsblk_t logical_sb_block; 5122 5110 struct inode *root; 5123 5111 int ret = -ENOMEM; 5124 5112 unsigned int i; 5125 - int needs_recovery, has_huge_files; 5113 + int needs_recovery; 5126 5114 int err = 0; 5127 5115 ext4_group_t first_not_zeroed; 5128 5116 struct ext4_fs_context *ctx = fc->fs_private; ··· 5258 5094 if (ext4_encoding_init(sb, es)) 5259 5095 goto failed_mount; 5260 5096 5261 - if (ext4_journal_data_mode_check(sb)) 5097 + if (ext4_check_journal_data_mode(sb)) 5262 5098 goto failed_mount; 5263 5099 5264 5100 sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | ··· 5270 5106 if (ext4_check_feature_compatibility(sb, es, silent)) 5271 5107 goto failed_mount; 5272 5108 5273 - if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) { 5274 - ext4_msg(sb, KERN_ERR, 5275 - "Number of reserved GDT blocks insanely large: %d", 5276 - le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); 5109 + if (ext4_block_group_meta_init(sb, silent)) 5277 5110 goto failed_mount; 5278 - } 5279 5111 5280 - if (sbi->s_daxdev) { 5281 - if (sb->s_blocksize == PAGE_SIZE) 5282 - set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); 5283 - else 5284 - ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); 5285 - } 5286 - 5287 - if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { 5288 - if (ext4_has_feature_inline_data(sb)) { 5289 - ext4_msg(sb, KERN_ERR, "Cannot use DAX on a 
filesystem" 5290 - " that may contain inline data"); 5291 - goto failed_mount; 5292 - } 5293 - if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) { 5294 - ext4_msg(sb, KERN_ERR, 5295 - "DAX unsupported by block device."); 5296 - goto failed_mount; 5297 - } 5298 - } 5299 - 5300 - if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { 5301 - ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", 5302 - es->s_encryption_level); 5303 - goto failed_mount; 5304 - } 5305 - 5306 - has_huge_files = ext4_has_feature_huge_file(sb); 5307 - sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, 5308 - has_huge_files); 5309 - sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); 5310 - 5311 - sbi->s_desc_size = le16_to_cpu(es->s_desc_size); 5312 - if (ext4_has_feature_64bit(sb)) { 5313 - if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || 5314 - sbi->s_desc_size > EXT4_MAX_DESC_SIZE || 5315 - !is_power_of_2(sbi->s_desc_size)) { 5316 - ext4_msg(sb, KERN_ERR, 5317 - "unsupported descriptor size %lu", 5318 - sbi->s_desc_size); 5319 - goto failed_mount; 5320 - } 5321 - } else 5322 - sbi->s_desc_size = EXT4_MIN_DESC_SIZE; 5323 - 5324 - sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); 5325 - sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); 5326 - 5327 - sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb); 5328 - if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) { 5329 - if (!silent) 5330 - ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); 5331 - goto failed_mount; 5332 - } 5333 - if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || 5334 - sbi->s_inodes_per_group > sb->s_blocksize * 8) { 5335 - ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", 5336 - sbi->s_inodes_per_group); 5337 - goto failed_mount; 5338 - } 5339 - sbi->s_itb_per_group = sbi->s_inodes_per_group / 5340 - sbi->s_inodes_per_block; 5341 - sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb); 
5342 - sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY; 5343 - sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); 5344 - sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); 5345 - 5346 - for (i = 0; i < 4; i++) 5347 - sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); 5348 - sbi->s_def_hash_version = es->s_def_hash_version; 5349 - if (ext4_has_feature_dir_index(sb)) { 5350 - i = le32_to_cpu(es->s_flags); 5351 - if (i & EXT2_FLAGS_UNSIGNED_HASH) 5352 - sbi->s_hash_unsigned = 3; 5353 - else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { 5354 - #ifdef __CHAR_UNSIGNED__ 5355 - if (!sb_rdonly(sb)) 5356 - es->s_flags |= 5357 - cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); 5358 - sbi->s_hash_unsigned = 3; 5359 - #else 5360 - if (!sb_rdonly(sb)) 5361 - es->s_flags |= 5362 - cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); 5363 - #endif 5364 - } 5365 - } 5112 + ext4_hash_info_init(sb); 5366 5113 5367 5114 if (ext4_handle_clustersize(sb)) 5368 5115 goto failed_mount; 5369 5116 5370 - /* 5371 - * Test whether we have more sectors than will fit in sector_t, 5372 - * and whether the max offset is addressable by the page cache. 
5373 - */ 5374 - err = generic_check_addressable(sb->s_blocksize_bits, 5375 - ext4_blocks_count(es)); 5376 - if (err) { 5377 - ext4_msg(sb, KERN_ERR, "filesystem" 5378 - " too large to mount safely on this system"); 5379 - goto failed_mount; 5380 - } 5381 - 5382 - if (ext4_geometry_check(sb, es)) 5117 + if (ext4_check_geometry(sb, es)) 5383 5118 goto failed_mount; 5384 5119 5385 5120 timer_setup(&sbi->s_err_report, print_daily_error_info, 0); ··· 5503 5440 sbi->s_journal->j_commit_callback = 5504 5441 ext4_journal_commit_callback; 5505 5442 5506 - block = ext4_count_free_clusters(sb); 5507 - ext4_free_blocks_count_set(sbi->s_es, 5508 - EXT4_C2B(sbi, block)); 5509 - err = percpu_counter_init(&sbi->s_freeclusters_counter, block, 5510 - GFP_KERNEL); 5511 - if (!err) { 5512 - unsigned long freei = ext4_count_free_inodes(sb); 5513 - sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); 5514 - err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, 5515 - GFP_KERNEL); 5516 - } 5517 - if (!err) 5518 - err = percpu_counter_init(&sbi->s_dirs_counter, 5519 - ext4_count_dirs(sb), GFP_KERNEL); 5520 - if (!err) 5521 - err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, 5522 - GFP_KERNEL); 5523 - if (!err) 5524 - err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0, 5525 - GFP_KERNEL); 5526 - if (!err) 5527 - err = percpu_init_rwsem(&sbi->s_writepages_rwsem); 5528 - 5529 - if (err) { 5530 - ext4_msg(sb, KERN_ERR, "insufficient memory"); 5443 + if (ext4_percpu_param_init(sbi)) 5531 5444 goto failed_mount6; 5532 - } 5533 5445 5534 5446 if (ext4_has_feature_flex_bg(sb)) 5535 5447 if (!ext4_fill_flex_info(sb)) { ··· 5586 5548 ext4_unregister_li_request(sb); 5587 5549 failed_mount6: 5588 5550 ext4_mb_release(sb); 5589 - rcu_read_lock(); 5590 - flex_groups = rcu_dereference(sbi->s_flex_groups); 5591 - if (flex_groups) { 5592 - for (i = 0; i < sbi->s_flex_groups_allocated; i++) 5593 - kvfree(flex_groups[i]); 5594 - kvfree(flex_groups); 5595 - } 5596 - 
rcu_read_unlock(); 5597 - percpu_counter_destroy(&sbi->s_freeclusters_counter); 5598 - percpu_counter_destroy(&sbi->s_freeinodes_counter); 5599 - percpu_counter_destroy(&sbi->s_dirs_counter); 5600 - percpu_counter_destroy(&sbi->s_dirtyclusters_counter); 5601 - percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); 5602 - percpu_free_rwsem(&sbi->s_writepages_rwsem); 5551 + ext4_flex_groups_free(sbi); 5552 + ext4_percpu_param_destroy(sbi); 5603 5553 failed_mount5: 5604 5554 ext4_ext_release(sb); 5605 5555 ext4_release_system_zone(sb); ··· 6894 6868 * last time. 6895 6869 */ 6896 6870 sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY; 6897 - } 6898 - 6899 - /* 6900 - * When we journal data on quota file, we have to flush journal to see 6901 - * all updates to the file when we bypass pagecache... 6902 - */ 6903 - if (EXT4_SB(sb)->s_journal && 6904 - ext4_should_journal_data(d_inode(path->dentry))) { 6905 - /* 6906 - * We don't need to lock updates but journal_flush() could 6907 - * otherwise be livelocked... 6908 - */ 6909 - jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 6910 - err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); 6911 - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 6912 - if (err) 6913 - return err; 6914 6871 } 6915 6872 6916 6873 lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
-2
fs/ext4/sysfs.c
··· 214 214 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); 215 215 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 216 216 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 217 - EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc); 218 217 EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); 219 218 EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); 220 219 EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); ··· 263 264 ATTR_LIST(mb_order2_req), 264 265 ATTR_LIST(mb_stream_req), 265 266 ATTR_LIST(mb_group_prealloc), 266 - ATTR_LIST(mb_max_inode_prealloc), 267 267 ATTR_LIST(mb_max_linear_groups), 268 268 ATTR_LIST(max_writeback_mb_bump), 269 269 ATTR_LIST(extent_max_zeroout_kb),
+14 -16
fs/ext4/verity.c
··· 42 42 loff_t pos) 43 43 { 44 44 while (count) { 45 - size_t n = min_t(size_t, count, 46 - PAGE_SIZE - offset_in_page(pos)); 47 - struct page *page; 45 + struct folio *folio; 46 + size_t n; 48 47 49 - page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT, 48 + folio = read_mapping_folio(inode->i_mapping, pos >> PAGE_SHIFT, 50 49 NULL); 51 - if (IS_ERR(page)) 52 - return PTR_ERR(page); 50 + if (IS_ERR(folio)) 51 + return PTR_ERR(folio); 53 52 54 - memcpy_from_page(buf, page, offset_in_page(pos), n); 55 - 56 - put_page(page); 53 + n = memcpy_from_file_folio(buf, folio, pos, count); 54 + folio_put(folio); 57 55 58 56 buf += n; 59 57 pos += n; ··· 361 363 pgoff_t index, 362 364 unsigned long num_ra_pages) 363 365 { 364 - struct page *page; 366 + struct folio *folio; 365 367 366 368 index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; 367 369 368 - page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); 369 - if (!page || !PageUptodate(page)) { 370 + folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0); 371 + if (!folio || !folio_test_uptodate(folio)) { 370 372 DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); 371 373 372 - if (page) 373 - put_page(page); 374 + if (folio) 375 + folio_put(folio); 374 376 else if (num_ra_pages > 1) 375 377 page_cache_ra_unbounded(&ractl, num_ra_pages, 0); 376 - page = read_mapping_page(inode->i_mapping, index, NULL); 378 + folio = read_mapping_folio(inode->i_mapping, index, NULL); 377 379 } 378 - return page; 380 + return folio_file_page(folio, index); 379 381 } 380 382 381 383 static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf,
+1 -1
fs/iomap/buffered-io.c
··· 467 467 */ 468 468 struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos) 469 469 { 470 - unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; 470 + unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS; 471 471 struct folio *folio; 472 472 473 473 if (iter->flags & IOMAP_NOWAIT)
+3
fs/jbd2/transaction.c
··· 2387 2387 spin_unlock(&jh->b_state_lock); 2388 2388 write_unlock(&journal->j_state_lock); 2389 2389 jbd2_journal_put_journal_head(jh); 2390 + /* Already zapped buffer? Nothing to do... */ 2391 + if (!bh->b_bdev) 2392 + return 0; 2390 2393 return -EBUSY; 2391 2394 } 2392 2395 /*
+1 -2
fs/netfs/buffered_read.c
··· 341 341 { 342 342 struct netfs_io_request *rreq; 343 343 struct folio *folio; 344 - unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; 345 344 pgoff_t index = pos >> PAGE_SHIFT; 346 345 int ret; 347 346 348 347 DEFINE_READAHEAD(ractl, file, NULL, mapping, index); 349 348 350 349 retry: 351 - folio = __filemap_get_folio(mapping, index, fgp_flags, 350 + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, 352 351 mapping_gfp_mask(mapping)); 353 352 if (!folio) 354 353 return -ENOMEM;
+2 -10
fs/nfs/file.c
··· 306 306 return false; 307 307 } 308 308 309 - static struct folio * 310 - nfs_folio_grab_cache_write_begin(struct address_space *mapping, pgoff_t index) 311 - { 312 - unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; 313 - 314 - return __filemap_get_folio(mapping, index, fgp_flags, 315 - mapping_gfp_mask(mapping)); 316 - } 317 - 318 309 /* 319 310 * This does the "real" work of the write. We must allocate and lock the 320 311 * page to be sent back to the generic routine, which then copies the ··· 326 335 file, mapping->host->i_ino, len, (long long) pos); 327 336 328 337 start: 329 - folio = nfs_folio_grab_cache_write_begin(mapping, pos >> PAGE_SHIFT); 338 + folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, FGP_WRITEBEGIN, 339 + mapping_gfp_mask(mapping)); 330 340 if (!folio) 331 341 return -ENOMEM; 332 342 *pagep = &folio->page;
+21
include/linux/fscrypt.h
··· 273 273 return (struct page *)page_private(bounce_page); 274 274 } 275 275 276 + static inline bool fscrypt_is_bounce_folio(struct folio *folio) 277 + { 278 + return folio->mapping == NULL; 279 + } 280 + 281 + static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio) 282 + { 283 + return bounce_folio->private; 284 + } 285 + 276 286 void fscrypt_free_bounce_page(struct page *bounce_page); 277 287 278 288 /* policy.c */ ··· 451 441 } 452 442 453 443 static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) 444 + { 445 + WARN_ON_ONCE(1); 446 + return ERR_PTR(-EINVAL); 447 + } 448 + 449 + static inline bool fscrypt_is_bounce_folio(struct folio *folio) 450 + { 451 + return false; 452 + } 453 + 454 + static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio) 454 455 { 455 456 WARN_ON_ONCE(1); 456 457 return ERR_PTR(-EINVAL);
-5
include/linux/page-flags.h
··· 762 762 #define folio_start_writeback_keepwrite(folio) \ 763 763 __folio_start_writeback(folio, true) 764 764 765 - static inline void set_page_writeback_keepwrite(struct page *page) 766 - { 767 - folio_start_writeback_keepwrite(page_folio(page)); 768 - } 769 - 770 765 static inline bool test_set_page_writeback(struct page *page) 771 766 { 772 767 return set_page_writeback(page);
+2
include/linux/pagemap.h
··· 507 507 #define FGP_ENTRY 0x00000080 508 508 #define FGP_STABLE 0x00000100 509 509 510 + #define FGP_WRITEBEGIN (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE) 511 + 510 512 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, 511 513 int fgp_flags, gfp_t gfp); 512 514 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
-7
include/trace/events/ext4.h
··· 584 584 (unsigned long) __entry->index) 585 585 ); 586 586 587 - DEFINE_EVENT(ext4__page_op, ext4_writepage, 588 - 589 - TP_PROTO(struct page *page), 590 - 591 - TP_ARGS(page) 592 - ); 593 - 594 587 DEFINE_EVENT(ext4__page_op, ext4_readpage, 595 588 596 589 TP_PROTO(struct page *page),
+117
include/uapi/linux/ext4.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ 2 + 3 + #ifndef _UAPI_LINUX_EXT4_H 4 + #define _UAPI_LINUX_EXT4_H 5 + #include <linux/fiemap.h> 6 + #include <linux/fs.h> 7 + #include <linux/ioctl.h> 8 + #include <linux/types.h> 9 + 10 + /* 11 + * ext4-specific ioctl commands 12 + */ 13 + #define EXT4_IOC_GETVERSION _IOR('f', 3, long) 14 + #define EXT4_IOC_SETVERSION _IOW('f', 4, long) 15 + #define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION 16 + #define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION 17 + #define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) 18 + #define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) 19 + #define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) 20 + #define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) 21 + #define EXT4_IOC_MIGRATE _IO('f', 9) 22 + /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ 23 + /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ 24 + #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) 25 + #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) 26 + #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) 27 + #define EXT4_IOC_SWAP_BOOT _IO('f', 17) 28 + #define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) 29 + /* ioctl codes 19--39 are reserved for fscrypt */ 30 + #define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) 31 + #define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) 32 + #define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) 33 + #define EXT4_IOC_CHECKPOINT _IOW('f', 43, __u32) 34 + #define EXT4_IOC_GETFSUUID _IOR('f', 44, struct fsuuid) 35 + #define EXT4_IOC_SETFSUUID _IOW('f', 44, struct fsuuid) 36 + 37 + #define EXT4_IOC_SHUTDOWN _IOR('X', 125, __u32) 38 + 39 + /* 40 + * ioctl commands in 32 bit emulation 41 + */ 42 + #define EXT4_IOC32_GETVERSION _IOR('f', 3, int) 43 + #define EXT4_IOC32_SETVERSION _IOW('f', 4, int) 44 + #define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) 45 + #define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) 46 + #define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned 
int) 47 + #define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) 48 + #define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION 49 + #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION 50 + 51 + /* 52 + * Flags returned by EXT4_IOC_GETSTATE 53 + * 54 + * We only expose to userspace a subset of the state flags in 55 + * i_state_flags 56 + */ 57 + #define EXT4_STATE_FLAG_EXT_PRECACHED 0x00000001 58 + #define EXT4_STATE_FLAG_NEW 0x00000002 59 + #define EXT4_STATE_FLAG_NEWENTRY 0x00000004 60 + #define EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008 61 + 62 + /* 63 + * Flags for ioctl EXT4_IOC_CHECKPOINT 64 + */ 65 + #define EXT4_IOC_CHECKPOINT_FLAG_DISCARD 0x1 66 + #define EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT 0x2 67 + #define EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN 0x4 68 + #define EXT4_IOC_CHECKPOINT_FLAG_VALID (EXT4_IOC_CHECKPOINT_FLAG_DISCARD | \ 69 + EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT | \ 70 + EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN) 71 + 72 + /* 73 + * Structure for EXT4_IOC_GETFSUUID/EXT4_IOC_SETFSUUID 74 + */ 75 + struct fsuuid { 76 + __u32 fsu_len; 77 + __u32 fsu_flags; 78 + __u8 fsu_uuid[]; 79 + }; 80 + 81 + /* 82 + * Structure for EXT4_IOC_MOVE_EXT 83 + */ 84 + struct move_extent { 85 + __u32 reserved; /* should be zero */ 86 + __u32 donor_fd; /* donor file descriptor */ 87 + __u64 orig_start; /* logical start offset in block for orig */ 88 + __u64 donor_start; /* logical start offset in block for donor */ 89 + __u64 len; /* block length to be moved */ 90 + __u64 moved_len; /* moved block length */ 91 + }; 92 + 93 + /* 94 + * Flags used by EXT4_IOC_SHUTDOWN 95 + */ 96 + #define EXT4_GOING_FLAGS_DEFAULT 0x0 /* going down */ 97 + #define EXT4_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ 98 + #define EXT4_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ 99 + 100 + /* Used to pass group descriptor data when online resize is done */ 101 + struct ext4_new_group_input { 102 + __u32 group; /* Group number for this data */ 103 + __u64 block_bitmap; /* 
Absolute block number of block bitmap */ 104 + __u64 inode_bitmap; /* Absolute block number of inode bitmap */ 105 + __u64 inode_table; /* Absolute block number of inode table start */ 106 + __u32 blocks_count; /* Total number of blocks in this group */ 107 + __u16 reserved_blocks; /* Number of reserved blocks in this group */ 108 + __u16 unused; 109 + }; 110 + 111 + /* 112 + * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag. 113 + * It indicates that the entry in extent status cache is for a hole. 114 + */ 115 + #define EXT4_FIEMAP_EXTENT_HOLE 0x08000000 116 + 117 + #endif /* _UAPI_LINUX_EXT4_H */
+1 -3
mm/folio-compat.c
··· 106 106 struct page *grab_cache_page_write_begin(struct address_space *mapping, 107 107 pgoff_t index) 108 108 { 109 - unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; 110 - 111 - return pagecache_get_page(mapping, index, fgp_flags, 109 + return pagecache_get_page(mapping, index, FGP_WRITEBEGIN, 112 110 mapping_gfp_mask(mapping)); 113 111 } 114 112 EXPORT_SYMBOL(grab_cache_page_write_begin);