Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'xfs-6.2-merge-8' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull XFS updates from Darrick Wong:
"The highlight of this is a batch of fixes for the online metadata
checking code as we start the loooong march towards merging online
repair. I aim to merge that in time for the 2023 LTS.

There are also a large number of data corruption and race condition
fixes in this patchset. Most notably fixed are write() calls to
unwritten extents racing with writeback, which required some late(r
than I prefer) code changes to iomap to support the necessary
revalidations. I don't really like iomap changes going in past -rc4,
but Dave and I have been working on it long enough that I chose to
push it for 6.2 anyway.

There are also a number of other subtle problems fixed, including the
log racing with inode writeback to write inodes with incorrect link
count to disk; file data mapping corruptions as a result of incorrect
lock cycling when attaching dquots; refcount metadata corruption if
one actually manages to share a block 2^32 times; and the log
clobbering cow staging extents if they were formerly metadata blocks.

Summary:

- Fix a race condition w.r.t. percpu inode free counters

- Fix a broken error return in xfs_remove

- Print FS UUID at mount/unmount time

- Numerous fixes to the online fsck code

- Fix inode locking inconsistency problems when dealing with realtime
metadata files

- Actually merge pull requests so that we capture the cover letter
contents

- Fix a race between rebuilding VFS inode state and the AIL flushing
inodes that could cause corrupt inodes to be written to the
filesystem

- Fix a data corruption problem resulting from a write() to an
unwritten extent racing with writeback started on behalf of memory
reclaim changing the extent state

- Add debugging knobs so that we can test iomap invalidation

- Fix the blockdev pagecache contents being stale after unmounting
the filesystem, leading to spurious xfs_db errors and corrupt
metadumps

- Fix a file mapping corruption bug due to ilock cycling when
attaching dquots to a file during delalloc reservation

- Fix a refcount btree corruption problem due to the refcount
adjustment code not handling MAXREFCOUNT correctly, resulting in
unnecessary record splits

- Fix COW staging extent allocations not being classified as USERDATA,
which results in filestreams being ignored and possible data
corruption if the allocation was filled from the AGFL and the block
buffer is still being tracked in the AIL

- Fix new duplicated includes

- Fix a race between the dquot shrinker and dquot freeing that could
cause a UAF"
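The MAXREFCOUNT fix in the list above rests on a simple rule: once a refcount record saturates, it is pinned at the maximum forever, so merge predicates must compare neighbouring records against the saturated post-adjustment value rather than naively adding the adjustment. A minimal userspace sketch of that rule (all names hypothetical; this is not the XFS code):

```c
#include <assert.h>
#include <stdint.h>

/* Hypothetical model of the saturating refcount rule.  REFC_MAX is a
 * stand-in for MAXREFCOUNT: a saturated count is pinned forever, so
 * adjusting a pinned record must not move (or wrap) it. */
#define REFC_MAX UINT32_MAX

static uint32_t refc_adjust(uint32_t refcount, int adjust)
{
	if (refcount == REFC_MAX)
		return REFC_MAX;	/* pinned: never adjusted again */
	return refcount + (uint32_t)adjust;
}

/* Adjacent records may merge only when the neighbour's count equals
 * the centre record's post-adjustment count. */
static int refc_want_merge(uint32_t neighbour, uint32_t centre, int adjust)
{
	return neighbour == refc_adjust(centre, adjust);
}
```

Comparing against `centre + adjust` directly, as the pre-fix predicates effectively did, would wrongly reject a merge between two pinned records and force needless record splits; routing the comparison through the saturating helper keeps pinned records mergeable.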

* tag 'xfs-6.2-merge-8' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (50 commits)
xfs: dquot shrinker doesn't check for XFS_DQFLAG_FREEING
xfs: Remove duplicated include in xfs_iomap.c
xfs: invalidate xfs_bufs when allocating cow extents
xfs: get rid of assert from xfs_btree_islastblock
xfs: estimate post-merge refcounts correctly
xfs: hoist refcount record merge predicates
xfs: fix super block buf log item UAF during force shutdown
xfs: wait iclog complete before tearing down AIL
xfs: attach dquots to inode before reading data/cow fork mappings
xfs: shut up -Wuninitialized in xfsaild_push
xfs: use memcpy, not strncpy, to format the attr prefix during listxattr
xfs: invalidate block device page cache during unmount
xfs: add debug knob to slow down write for fun
xfs: add debug knob to slow down writeback for fun
xfs: drop write error injection is unfixable, remove it
xfs: use iomap_valid method to detect stale cached iomaps
iomap: write iomap validity checks
xfs: xfs_bmap_punch_delalloc_range() should take a byte range
iomap: buffered write failure should not truncate the page cache
xfs,iomap: move delalloc punching to iomap
...
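Several commits above move short-write delalloc punching into iomap: after a short or failed buffered write, reservations are kept only under folios that are still dirty, and every clean gap in between is punched out, with all ranges expressed as half-open [start, end) intervals. A reduced userspace model of that scan, using one dirty flag per page instead of a folio walk (names hypothetical; not the kernel implementation):

```c
#include <assert.h>
#include <stddef.h>

/* Hypothetical model of the dirty-range scan: walk pages in
 * [0, npages) and punch out every range not covered by a dirty page,
 * since only dirty pages still need their delalloc reservation for
 * writeback.  Half-open intervals avoid the +1/-1 arithmetic a
 * closed-interval walk would need. */
struct punch_log { size_t start[8], len[8]; int n; };

static void punch(struct punch_log *log, size_t start, size_t len)
{
	/* record the punched range; the kernel calls a punch callback */
	log->start[log->n] = start;
	log->len[log->n] = len;
	log->n++;
}

static void delalloc_scan(const int *dirty, size_t npages,
			  struct punch_log *log)
{
	size_t punch_start = 0;	/* next punch begins here */
	size_t i;

	for (i = 0; i < npages; i++) {
		if (!dirty[i])
			continue;
		if (i > punch_start)
			punch(log, punch_start, i - punch_start);
		punch_start = i + 1;	/* keep reservation under page i */
	}
	if (punch_start < npages)
		punch(log, punch_start, npages - punch_start);
}
```

The real code derives the dirty ranges with mapping_seek_hole_data() and folio_test_dirty() under the invalidate lock, but the interval bookkeeping is the same shape.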

+1317 -313
+253 -1
fs/iomap/buffered-io.c
··· 584 584 return iomap_read_inline_data(iter, folio); 585 585 } 586 586 587 - static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, 587 + static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, 588 588 size_t len, struct folio **foliop) 589 589 { 590 590 const struct iomap_page_ops *page_ops = iter->iomap.page_ops; ··· 618 618 status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM; 619 619 goto out_no_page; 620 620 } 621 + 622 + /* 623 + * Now we have a locked folio, before we do anything with it we need to 624 + * check that the iomap we have cached is not stale. The inode extent 625 + * mapping can change due to concurrent IO in flight (e.g. 626 + * IOMAP_UNWRITTEN state can change and memory reclaim could have 627 + * reclaimed a previously partially written page at this index after IO 628 + * completion before this write reaches this file offset) and hence we 629 + * could do the wrong thing here (zero a page range incorrectly or fail 630 + * to zero) and corrupt data. 631 + */ 632 + if (page_ops && page_ops->iomap_valid) { 633 + bool iomap_valid = page_ops->iomap_valid(iter->inode, 634 + &iter->iomap); 635 + if (!iomap_valid) { 636 + iter->iomap.flags |= IOMAP_F_STALE; 637 + status = 0; 638 + goto out_unlock; 639 + } 640 + } 641 + 621 642 if (pos + len > folio_pos(folio) + folio_size(folio)) 622 643 len = folio_pos(folio) + folio_size(folio) - pos; 623 644 ··· 794 773 status = iomap_write_begin(iter, pos, bytes, &folio); 795 774 if (unlikely(status)) 796 775 break; 776 + if (iter->iomap.flags & IOMAP_F_STALE) 777 + break; 797 778 798 779 page = folio_file_page(folio, pos >> PAGE_SHIFT); 799 780 if (mapping_writably_mapped(mapping)) ··· 855 832 } 856 833 EXPORT_SYMBOL_GPL(iomap_file_buffered_write); 857 834 835 + /* 836 + * Scan the data range passed to us for dirty page cache folios. If we find a 837 + * dirty folio, punch out the preceding range and update the offset from which 838 + * the next punch will start. 
839 + * 840 + * We can punch out storage reservations under clean pages because they either 841 + * contain data that has been written back - in which case the delalloc punch 842 + * over that range is a no-op - or they have been read faults in which case they 843 + * contain zeroes and we can remove the delalloc backing range and any new 844 + * writes to those pages will do the normal hole filling operation... 845 + * 846 + * This makes the logic simple: we only need to keep the delalloc extents 847 + * over the dirty ranges of the page cache. 848 + * 849 + * This function uses [start_byte, end_byte) intervals (i.e. open ended) to 850 + * simplify range iterations. 851 + */ 852 + static int iomap_write_delalloc_scan(struct inode *inode, 853 + loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, 854 + int (*punch)(struct inode *inode, loff_t offset, loff_t length)) 855 + { 856 + while (start_byte < end_byte) { 857 + struct folio *folio; 858 + 859 + /* grab locked page */ 860 + folio = filemap_lock_folio(inode->i_mapping, 861 + start_byte >> PAGE_SHIFT); 862 + if (!folio) { 863 + start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) + 864 + PAGE_SIZE; 865 + continue; 866 + } 867 + 868 + /* if dirty, punch up to offset */ 869 + if (folio_test_dirty(folio)) { 870 + if (start_byte > *punch_start_byte) { 871 + int error; 872 + 873 + error = punch(inode, *punch_start_byte, 874 + start_byte - *punch_start_byte); 875 + if (error) { 876 + folio_unlock(folio); 877 + folio_put(folio); 878 + return error; 879 + } 880 + } 881 + 882 + /* 883 + * Make sure the next punch start is correctly bound to 884 + * the end of this data range, not the end of the folio. 
885 + */ 886 + *punch_start_byte = min_t(loff_t, end_byte, 887 + folio_next_index(folio) << PAGE_SHIFT); 888 + } 889 + 890 + /* move offset to start of next folio in range */ 891 + start_byte = folio_next_index(folio) << PAGE_SHIFT; 892 + folio_unlock(folio); 893 + folio_put(folio); 894 + } 895 + return 0; 896 + } 897 + 898 + /* 899 + * Punch out all the delalloc blocks in the range given except for those that 900 + * have dirty data still pending in the page cache - those are going to be 901 + * written and so must still retain the delalloc backing for writeback. 902 + * 903 + * As we are scanning the page cache for data, we don't need to reimplement the 904 + * wheel - mapping_seek_hole_data() does exactly what we need to identify the 905 + * start and end of data ranges correctly even for sub-folio block sizes. This 906 + * byte range based iteration is especially convenient because it means we 907 + * don't have to care about variable size folios, nor where the start or end of 908 + * the data range lies within a folio, if they lie within the same folio or even 909 + * if there are multiple discontiguous data ranges within the folio. 910 + * 911 + * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so 912 + * can return data ranges that exist in the cache beyond EOF. e.g. a page fault 913 + * spanning EOF will initialise the post-EOF data to zeroes and mark it up to 914 + * date. A write page fault can then mark it dirty. If we then fail a write() 915 + * beyond EOF into that up to date cached range, we allocate a delalloc block 916 + * beyond EOF and then have to punch it out. Because the range is up to date, 917 + * mapping_seek_hole_data() will return it, and we will skip the punch because 918 + * the folio is dirty. This is incorrect - we always need to punch out delalloc 919 + * beyond EOF in this case as writeback will never write back and convert that 920 + * delalloc block beyond EOF. 
Hence we limit the cached data scan range to EOF, 921 + * resulting in always punching out the range from the EOF to the end of the 922 + * range the iomap spans. 923 + * 924 + * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because it 925 + * matches the intervals returned by mapping_seek_hole_data(). i.e. SEEK_DATA 926 + * returns the start of a data range (start_byte), and SEEK_HOLE(start_byte) 927 + * returns the end of the data range (data_end). Using closed intervals would 928 + * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose 929 + * the code to subtle off-by-one bugs.... 930 + */ 931 + static int iomap_write_delalloc_release(struct inode *inode, 932 + loff_t start_byte, loff_t end_byte, 933 + int (*punch)(struct inode *inode, loff_t pos, loff_t length)) 934 + { 935 + loff_t punch_start_byte = start_byte; 936 + loff_t scan_end_byte = min(i_size_read(inode), end_byte); 937 + int error = 0; 938 + 939 + /* 940 + * Lock the mapping to avoid races with page faults re-instantiating 941 + * folios and dirtying them via ->page_mkwrite whilst we walk the 942 + * cache and perform delalloc extent removal. Failing to do this can 943 + * leave dirty pages with no space reservation in the cache. 944 + */ 945 + filemap_invalidate_lock(inode->i_mapping); 946 + while (start_byte < scan_end_byte) { 947 + loff_t data_end; 948 + 949 + start_byte = mapping_seek_hole_data(inode->i_mapping, 950 + start_byte, scan_end_byte, SEEK_DATA); 951 + /* 952 + * If there is no more data to scan, all that is left is to 953 + * punch out the remaining range. 
954 + */ 955 + if (start_byte == -ENXIO || start_byte == scan_end_byte) 956 + break; 957 + if (start_byte < 0) { 958 + error = start_byte; 959 + goto out_unlock; 960 + } 961 + WARN_ON_ONCE(start_byte < punch_start_byte); 962 + WARN_ON_ONCE(start_byte > scan_end_byte); 963 + 964 + /* 965 + * We find the end of this contiguous cached data range by 966 + * seeking from start_byte to the beginning of the next hole. 967 + */ 968 + data_end = mapping_seek_hole_data(inode->i_mapping, start_byte, 969 + scan_end_byte, SEEK_HOLE); 970 + if (data_end < 0) { 971 + error = data_end; 972 + goto out_unlock; 973 + } 974 + WARN_ON_ONCE(data_end <= start_byte); 975 + WARN_ON_ONCE(data_end > scan_end_byte); 976 + 977 + error = iomap_write_delalloc_scan(inode, &punch_start_byte, 978 + start_byte, data_end, punch); 979 + if (error) 980 + goto out_unlock; 981 + 982 + /* The next data search starts at the end of this one. */ 983 + start_byte = data_end; 984 + } 985 + 986 + if (punch_start_byte < end_byte) 987 + error = punch(inode, punch_start_byte, 988 + end_byte - punch_start_byte); 989 + out_unlock: 990 + filemap_invalidate_unlock(inode->i_mapping); 991 + return error; 992 + } 993 + 994 + /* 995 + * When a short write occurs, the filesystem may need to remove reserved space 996 + * that was allocated in ->iomap_begin from its ->iomap_end method. For 997 + * filesystems that use delayed allocation, we need to punch out delalloc 998 + * extents from the range that are not dirty in the page cache. As the write can 999 + * race with page faults, there can be dirty pages over the delalloc extent 1000 + * outside the range of a short write but still within the delalloc extent 1001 + * allocated for this iomap. 1002 + * 1003 + * This function uses [start_byte, end_byte) intervals (i.e. open ended) to 1004 + * simplify range iterations. 1005 + * 1006 + * The punch() callback *must* only punch delalloc extents in the range passed 1007 + * to it. 
It must skip over all other types of extents in the range and leave 1008 + * them completely unchanged. It must do this punch atomically with respect to 1009 + * other extent modifications. 1010 + * 1011 + * The punch() callback may be called with a folio locked to prevent writeback 1012 + * extent allocation racing at the edge of the range we are currently punching. 1013 + * The locked folio may or may not cover the range being punched, so it is not 1014 + * safe for the punch() callback to lock folios itself. 1015 + * 1016 + * Lock order is: 1017 + * 1018 + * inode->i_rwsem (shared or exclusive) 1019 + * inode->i_mapping->invalidate_lock (exclusive) 1020 + * folio_lock() 1021 + * ->punch 1022 + * internal filesystem allocation lock 1023 + */ 1024 + int iomap_file_buffered_write_punch_delalloc(struct inode *inode, 1025 + struct iomap *iomap, loff_t pos, loff_t length, 1026 + ssize_t written, 1027 + int (*punch)(struct inode *inode, loff_t pos, loff_t length)) 1028 + { 1029 + loff_t start_byte; 1030 + loff_t end_byte; 1031 + int blocksize = i_blocksize(inode); 1032 + 1033 + if (iomap->type != IOMAP_DELALLOC) 1034 + return 0; 1035 + 1036 + /* If we didn't reserve the blocks, we're not allowed to punch them. */ 1037 + if (!(iomap->flags & IOMAP_F_NEW)) 1038 + return 0; 1039 + 1040 + /* 1041 + * start_byte refers to the first unused block after a short write. If 1042 + * nothing was written, round offset down to point at the first block in 1043 + * the range. 
1044 + */ 1045 + if (unlikely(!written)) 1046 + start_byte = round_down(pos, blocksize); 1047 + else 1048 + start_byte = round_up(pos + written, blocksize); 1049 + end_byte = round_up(pos + length, blocksize); 1050 + 1051 + /* Nothing to do if we've written the entire delalloc extent */ 1052 + if (start_byte >= end_byte) 1053 + return 0; 1054 + 1055 + return iomap_write_delalloc_release(inode, start_byte, end_byte, 1056 + punch); 1057 + } 1058 + EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc); 1059 + 858 1060 static loff_t iomap_unshare_iter(struct iomap_iter *iter) 859 1061 { 860 1062 struct iomap *iomap = &iter->iomap; ··· 1104 856 status = iomap_write_begin(iter, pos, bytes, &folio); 1105 857 if (unlikely(status)) 1106 858 return status; 859 + if (iter->iomap.flags & IOMAP_F_STALE) 860 + break; 1107 861 1108 862 status = iomap_write_end(iter, pos, bytes, bytes, folio); 1109 863 if (WARN_ON_ONCE(status == 0)) ··· 1161 911 status = iomap_write_begin(iter, pos, bytes, &folio); 1162 912 if (status) 1163 913 return status; 914 + if (iter->iomap.flags & IOMAP_F_STALE) 915 + break; 1164 916 1165 917 offset = offset_in_folio(folio, pos); 1166 918 if (bytes > folio_size(folio) - offset)
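The ->iomap_valid hook wired up above is at heart a sequence-counter check: the filesystem samples a counter when it builds a mapping, bumps it on every extent-tree change, and only trusts the cached mapping under the folio lock while the counters still match (XFS feeds this from xfs_iomap_inode_sequence(), as the xfs_bmap.c hunk below shows). A reduced userspace sketch of the pattern (hypothetical types; not the iomap API):

```c
#include <assert.h>
#include <stdint.h>

/* Hypothetical model of a sequence-validated cached mapping.  Every
 * change to the extent tree bumps its counter; a cached mapping
 * remembers the counter value it was built from. */
struct extent_tree { uint32_t seq; };
struct cached_map  { uint32_t seq; };

static void extent_tree_change(struct extent_tree *t)
{
	t->seq++;	/* concurrent IO changed the extent map */
}

static struct cached_map map_extents(const struct extent_tree *t)
{
	/* sample the counter at mapping time */
	return (struct cached_map){ .seq = t->seq };
}

/* Under the folio lock: a counter mismatch means the mapping may
 * describe stale state and must be rebuilt before use. */
static int map_valid(const struct extent_tree *t, const struct cached_map *m)
{
	return m->seq == t->seq;
}
```

In the real code an invalid mapping sets IOMAP_F_STALE rather than failing, so the iterator can remap the remaining range and retry.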
+18 -1
fs/iomap/iter.c
··· 7 7 #include <linux/iomap.h> 8 8 #include "trace.h" 9 9 10 + /* 11 + * Advance to the next range we need to map. 12 + * 13 + * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully 14 + * processed - it was aborted because the extent the iomap spanned may have been 15 + * changed during the operation. In this case, the iteration behaviour is to 16 + * remap the unprocessed range of the iter, and that means we may need to remap 17 + * even when we've made no progress (i.e. iter->processed = 0). Hence the 18 + * "finished iterating" case needs to distinguish between 19 + * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we 20 + * need to remap the entire remaining range. 21 + */ 10 22 static inline int iomap_iter_advance(struct iomap_iter *iter) 11 23 { 24 + bool stale = iter->iomap.flags & IOMAP_F_STALE; 25 + 12 26 /* handle the previous iteration (if any) */ 13 27 if (iter->iomap.length) { 14 - if (iter->processed <= 0) 28 + if (iter->processed < 0) 15 29 return iter->processed; 30 + if (!iter->processed && !stale) 31 + return 0; 16 32 if (WARN_ON_ONCE(iter->processed > iomap_length(iter))) 17 33 return -EIO; 18 34 iter->pos += iter->processed; ··· 49 33 WARN_ON_ONCE(iter->iomap.offset > iter->pos); 50 34 WARN_ON_ONCE(iter->iomap.length == 0); 51 35 WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos); 36 + WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE); 52 37 53 38 trace_iomap_iter_dstmap(iter->inode, &iter->iomap); 54 39 if (iter->srcmap.type != IOMAP_HOLE)
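The iomap_iter_advance() change above turns the old `processed <= 0` test into a three-way decision: negative means error, and zero means done only when the mapping did not go stale; a stale mapping must be remapped even with zero progress. A small model of that decision table (hypothetical names):

```c
#include <assert.h>

/* Hypothetical model of the iomap_iter_advance() decision.
 * "processed" is how much of the previous mapping was consumed and
 * "stale" models IOMAP_F_STALE: an operation aborted because its
 * mapping changed underneath it must be remapped despite making no
 * progress. */
enum iter_action { ITER_ERROR, ITER_DONE, ITER_REMAP };

static enum iter_action iter_advance_action(long processed, int stale)
{
	if (processed < 0)
		return ITER_ERROR;	/* propagate the error */
	if (processed == 0 && !stale)
		return ITER_DONE;	/* finished iterating */
	return ITER_REMAP;		/* map the remaining range */
}
```

The pre-fix `processed <= 0` check conflated the done and stale cases, which is exactly why the patch splits the comparison in two.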
+5 -3
fs/xfs/libxfs/xfs_bmap.c
··· 4058 4058 * the busy list. 4059 4059 */ 4060 4060 bma->datatype = XFS_ALLOC_NOBUSY; 4061 - if (whichfork == XFS_DATA_FORK) { 4061 + if (whichfork == XFS_DATA_FORK || whichfork == XFS_COW_FORK) { 4062 4062 bma->datatype |= XFS_ALLOC_USERDATA; 4063 4063 if (bma->offset == 0) 4064 4064 bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; ··· 4551 4551 * the extent. Just return the real extent at this offset. 4552 4552 */ 4553 4553 if (!isnullstartblock(bma.got.br_startblock)) { 4554 - xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); 4554 + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, 4555 + xfs_iomap_inode_sequence(ip, flags)); 4555 4556 *seq = READ_ONCE(ifp->if_seq); 4556 4557 goto out_trans_cancel; 4557 4558 } ··· 4600 4599 XFS_STATS_INC(mp, xs_xstrat_quick); 4601 4600 4602 4601 ASSERT(!isnullstartblock(bma.got.br_startblock)); 4603 - xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); 4602 + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, 4603 + xfs_iomap_inode_sequence(ip, flags)); 4604 4604 *seq = READ_ONCE(ifp->if_seq); 4605 4605 4606 4606 if (whichfork == XFS_COW_FORK)
-1
fs/xfs/libxfs/xfs_btree.h
··· 556 556 struct xfs_buf *bp; 557 557 558 558 block = xfs_btree_get_block(cur, level, &bp); 559 - ASSERT(block && xfs_btree_check_block(cur, block, level, bp) == 0); 560 559 561 560 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 562 561 return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
+10 -8
fs/xfs/libxfs/xfs_errortag.h
··· 40 40 #define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25 41 41 #define XFS_ERRTAG_BMAP_FINISH_ONE 26 42 42 #define XFS_ERRTAG_AG_RESV_CRITICAL 27 43 + 43 44 /* 44 - * DEBUG mode instrumentation to test and/or trigger delayed allocation 45 - * block killing in the event of failed writes. When enabled, all 46 - * buffered writes are silenty dropped and handled as if they failed. 47 - * All delalloc blocks in the range of the write (including pre-existing 48 - * delalloc blocks!) are tossed as part of the write failure error 49 - * handling sequence. 45 + * Drop-writes support removed because write error handling cannot trash 46 + * pre-existing delalloc extents in any useful way anymore. We retain the 47 + * definition so that we can reject it as an invalid value in 48 + * xfs_errortag_valid(). 50 49 */ 51 50 #define XFS_ERRTAG_DROP_WRITES 28 52 51 #define XFS_ERRTAG_LOG_BAD_CRC 29 ··· 61 62 #define XFS_ERRTAG_LARP 39 62 63 #define XFS_ERRTAG_DA_LEAF_SPLIT 40 63 64 #define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41 64 - #define XFS_ERRTAG_MAX 42 65 + #define XFS_ERRTAG_WB_DELAY_MS 42 66 + #define XFS_ERRTAG_WRITE_DELAY_MS 43 67 + #define XFS_ERRTAG_MAX 44 65 68 66 69 /* 67 70 * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. ··· 96 95 #define XFS_RANDOM_REFCOUNT_FINISH_ONE 1 97 96 #define XFS_RANDOM_BMAP_FINISH_ONE 1 98 97 #define XFS_RANDOM_AG_RESV_CRITICAL 4 99 - #define XFS_RANDOM_DROP_WRITES 1 100 98 #define XFS_RANDOM_LOG_BAD_CRC 1 101 99 #define XFS_RANDOM_LOG_ITEM_PIN 1 102 100 #define XFS_RANDOM_BUF_LRU_REF 2 ··· 109 109 #define XFS_RANDOM_LARP 1 110 110 #define XFS_RANDOM_DA_LEAF_SPLIT 1 111 111 #define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 112 + #define XFS_RANDOM_WB_DELAY_MS 3000 113 + #define XFS_RANDOM_WRITE_DELAY_MS 3000 112 114 113 115 #endif /* __XFS_ERRORTAG_H_ */
+130 -16
fs/xfs/libxfs/xfs_refcount.c
··· 815 815 /* Is this extent valid? */ 816 816 static inline bool 817 817 xfs_refc_valid( 818 - struct xfs_refcount_irec *rc) 818 + const struct xfs_refcount_irec *rc) 819 819 { 820 820 return rc->rc_startblock != NULLAGBLOCK; 821 + } 822 + 823 + static inline xfs_nlink_t 824 + xfs_refc_merge_refcount( 825 + const struct xfs_refcount_irec *irec, 826 + enum xfs_refc_adjust_op adjust) 827 + { 828 + /* Once a record hits MAXREFCOUNT, it is pinned there forever */ 829 + if (irec->rc_refcount == MAXREFCOUNT) 830 + return MAXREFCOUNT; 831 + return irec->rc_refcount + adjust; 832 + } 833 + 834 + static inline bool 835 + xfs_refc_want_merge_center( 836 + const struct xfs_refcount_irec *left, 837 + const struct xfs_refcount_irec *cleft, 838 + const struct xfs_refcount_irec *cright, 839 + const struct xfs_refcount_irec *right, 840 + bool cleft_is_cright, 841 + enum xfs_refc_adjust_op adjust, 842 + unsigned long long *ulenp) 843 + { 844 + unsigned long long ulen = left->rc_blockcount; 845 + xfs_nlink_t new_refcount; 846 + 847 + /* 848 + * To merge with a center record, both shoulder records must be 849 + * adjacent to the record we want to adjust. This is only true if 850 + * find_left and find_right made all four records valid. 851 + */ 852 + if (!xfs_refc_valid(left) || !xfs_refc_valid(right) || 853 + !xfs_refc_valid(cleft) || !xfs_refc_valid(cright)) 854 + return false; 855 + 856 + /* There must only be one record for the entire range. */ 857 + if (!cleft_is_cright) 858 + return false; 859 + 860 + /* The shoulder record refcounts must match the new refcount. */ 861 + new_refcount = xfs_refc_merge_refcount(cleft, adjust); 862 + if (left->rc_refcount != new_refcount) 863 + return false; 864 + if (right->rc_refcount != new_refcount) 865 + return false; 866 + 867 + /* 868 + * The new record cannot exceed the max length. ulen is a ULL as the 869 + * individual record block counts can be up to (u32 - 1) in length 870 + * hence we need to catch u32 addition overflows here. 
871 + */ 872 + ulen += cleft->rc_blockcount + right->rc_blockcount; 873 + if (ulen >= MAXREFCEXTLEN) 874 + return false; 875 + 876 + *ulenp = ulen; 877 + return true; 878 + } 879 + 880 + static inline bool 881 + xfs_refc_want_merge_left( 882 + const struct xfs_refcount_irec *left, 883 + const struct xfs_refcount_irec *cleft, 884 + enum xfs_refc_adjust_op adjust) 885 + { 886 + unsigned long long ulen = left->rc_blockcount; 887 + xfs_nlink_t new_refcount; 888 + 889 + /* 890 + * For a left merge, the left shoulder record must be adjacent to the 891 + * start of the range. If this is true, find_left made left and cleft 892 + * contain valid contents. 893 + */ 894 + if (!xfs_refc_valid(left) || !xfs_refc_valid(cleft)) 895 + return false; 896 + 897 + /* Left shoulder record refcount must match the new refcount. */ 898 + new_refcount = xfs_refc_merge_refcount(cleft, adjust); 899 + if (left->rc_refcount != new_refcount) 900 + return false; 901 + 902 + /* 903 + * The new record cannot exceed the max length. ulen is a ULL as the 904 + * individual record block counts can be up to (u32 - 1) in length 905 + * hence we need to catch u32 addition overflows here. 906 + */ 907 + ulen += cleft->rc_blockcount; 908 + if (ulen >= MAXREFCEXTLEN) 909 + return false; 910 + 911 + return true; 912 + } 913 + 914 + static inline bool 915 + xfs_refc_want_merge_right( 916 + const struct xfs_refcount_irec *cright, 917 + const struct xfs_refcount_irec *right, 918 + enum xfs_refc_adjust_op adjust) 919 + { 920 + unsigned long long ulen = right->rc_blockcount; 921 + xfs_nlink_t new_refcount; 922 + 923 + /* 924 + * For a right merge, the right shoulder record must be adjacent to the 925 + * end of the range. If this is true, find_right made cright and right 926 + * contain valid contents. 927 + */ 928 + if (!xfs_refc_valid(right) || !xfs_refc_valid(cright)) 929 + return false; 930 + 931 + /* Right shoulder record refcount must match the new refcount. 
*/ 932 + new_refcount = xfs_refc_merge_refcount(cright, adjust); 933 + if (right->rc_refcount != new_refcount) 934 + return false; 935 + 936 + /* 937 + * The new record cannot exceed the max length. ulen is a ULL as the 938 + * individual record block counts can be up to (u32 - 1) in length 939 + * hence we need to catch u32 addition overflows here. 940 + */ 941 + ulen += cright->rc_blockcount; 942 + if (ulen >= MAXREFCEXTLEN) 943 + return false; 944 + 945 + return true; 821 946 } 822 947 823 948 /* ··· 986 861 (cleft.rc_blockcount == cright.rc_blockcount); 987 862 988 863 /* Try to merge left, cleft, and right. cleft must == cright. */ 989 - ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount + 990 - right.rc_blockcount; 991 - if (xfs_refc_valid(&left) && xfs_refc_valid(&right) && 992 - xfs_refc_valid(&cleft) && xfs_refc_valid(&cright) && cequal && 993 - left.rc_refcount == cleft.rc_refcount + adjust && 994 - right.rc_refcount == cleft.rc_refcount + adjust && 995 - ulen < MAXREFCEXTLEN) { 864 + if (xfs_refc_want_merge_center(&left, &cleft, &cright, &right, cequal, 865 + adjust, &ulen)) { 996 866 *shape_changed = true; 997 867 return xfs_refcount_merge_center_extents(cur, &left, &cleft, 998 868 &right, ulen, aglen); 999 869 } 1000 870 1001 871 /* Try to merge left and cleft. */ 1002 - ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount; 1003 - if (xfs_refc_valid(&left) && xfs_refc_valid(&cleft) && 1004 - left.rc_refcount == cleft.rc_refcount + adjust && 1005 - ulen < MAXREFCEXTLEN) { 872 + if (xfs_refc_want_merge_left(&left, &cleft, adjust)) { 1006 873 *shape_changed = true; 1007 874 error = xfs_refcount_merge_left_extent(cur, &left, &cleft, 1008 875 agbno, aglen); ··· 1010 893 } 1011 894 1012 895 /* Try to merge cright and right. 
*/ 1013 - ulen = (unsigned long long)right.rc_blockcount + cright.rc_blockcount; 1014 - if (xfs_refc_valid(&right) && xfs_refc_valid(&cright) && 1015 - right.rc_refcount == cright.rc_refcount + adjust && 1016 - ulen < MAXREFCEXTLEN) { 896 + if (xfs_refc_want_merge_right(&cright, &right, adjust)) { 1017 897 *shape_changed = true; 1018 898 return xfs_refcount_merge_right_extent(cur, &right, &cright, 1019 899 aglen);
+3 -1
fs/xfs/libxfs/xfs_sb.c
··· 972 972 */ 973 973 if (xfs_has_lazysbcount(mp)) { 974 974 mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); 975 - mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); 975 + mp->m_sb.sb_ifree = min_t(uint64_t, 976 + percpu_counter_sum(&mp->m_ifree), 977 + mp->m_sb.sb_icount); 976 978 mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); 977 979 } 978 980
+29 -18
fs/xfs/scrub/agheader.c
··· 609 609 /* AGFL */ 610 610 611 611 struct xchk_agfl_info { 612 - unsigned int sz_entries; 612 + /* Number of AGFL entries that the AGF claims are in use. */ 613 + unsigned int agflcount; 614 + 615 + /* Number of AGFL entries that we found. */ 613 616 unsigned int nr_entries; 617 + 618 + /* Buffer to hold AGFL entries for extent checking. */ 614 619 xfs_agblock_t *entries; 620 + 621 + struct xfs_buf *agfl_bp; 615 622 struct xfs_scrub *sc; 616 623 }; 617 624 ··· 648 641 struct xfs_scrub *sc = sai->sc; 649 642 650 643 if (xfs_verify_agbno(sc->sa.pag, agbno) && 651 - sai->nr_entries < sai->sz_entries) 644 + sai->nr_entries < sai->agflcount) 652 645 sai->entries[sai->nr_entries++] = agbno; 653 646 else 654 - xchk_block_set_corrupt(sc, sc->sa.agfl_bp); 647 + xchk_block_set_corrupt(sc, sai->agfl_bp); 655 648 656 649 xchk_agfl_block_xref(sc, agbno); 657 650 ··· 703 696 xchk_agfl( 704 697 struct xfs_scrub *sc) 705 698 { 706 - struct xchk_agfl_info sai; 699 + struct xchk_agfl_info sai = { 700 + .sc = sc, 701 + }; 707 702 struct xfs_agf *agf; 708 703 xfs_agnumber_t agno = sc->sm->sm_agno; 709 - unsigned int agflcount; 710 704 unsigned int i; 711 705 int error; 712 706 707 + /* Lock the AGF and AGI so that nobody can touch this AG. */ 713 708 error = xchk_ag_read_headers(sc, agno, &sc->sa); 714 709 if (!xchk_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error)) 715 - goto out; 710 + return error; 716 711 if (!sc->sa.agf_bp) 717 712 return -EFSCORRUPTED; 718 - xchk_buffer_recheck(sc, sc->sa.agfl_bp); 713 + 714 + /* Try to read the AGFL, and verify its structure if we get it. */ 715 + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &sai.agfl_bp); 716 + if (!xchk_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error)) 717 + return error; 718 + xchk_buffer_recheck(sc, sai.agfl_bp); 719 719 720 720 xchk_agfl_xref(sc); 721 721 ··· 731 717 732 718 /* Allocate buffer to ensure uniqueness of AGFL entries. 
*/ 733 719 agf = sc->sa.agf_bp->b_addr; 734 - agflcount = be32_to_cpu(agf->agf_flcount); 735 - if (agflcount > xfs_agfl_size(sc->mp)) { 720 + sai.agflcount = be32_to_cpu(agf->agf_flcount); 721 + if (sai.agflcount > xfs_agfl_size(sc->mp)) { 736 722 xchk_block_set_corrupt(sc, sc->sa.agf_bp); 737 723 goto out; 738 724 } 739 - memset(&sai, 0, sizeof(sai)); 740 - sai.sc = sc; 741 - sai.sz_entries = agflcount; 742 - sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount, 743 - KM_MAYFAIL); 725 + sai.entries = kvcalloc(sai.agflcount, sizeof(xfs_agblock_t), 726 + XCHK_GFP_FLAGS); 744 727 if (!sai.entries) { 745 728 error = -ENOMEM; 746 729 goto out; 747 730 } 748 731 749 732 /* Check the blocks in the AGFL. */ 750 - error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr, 751 - sc->sa.agfl_bp, xchk_agfl_block, &sai); 733 + error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr, sai.agfl_bp, 734 + xchk_agfl_block, &sai); 752 735 if (error == -ECANCELED) { 753 736 error = 0; 754 737 goto out_free; ··· 753 742 if (error) 754 743 goto out_free; 755 744 756 - if (agflcount != sai.nr_entries) { 745 + if (sai.agflcount != sai.nr_entries) { 757 746 xchk_block_set_corrupt(sc, sc->sa.agf_bp); 758 747 goto out_free; 759 748 } ··· 769 758 } 770 759 771 760 out_free: 772 - kmem_free(sai.entries); 761 + kvfree(sai.entries); 773 762 out: 774 763 return error; 775 764 }
+67 -14
fs/xfs/scrub/agheader_repair.c
··· 442 442 /* AGFL */ 443 443 444 444 struct xrep_agfl { 445 + /* Bitmap of alleged AGFL blocks that we're not going to add. */ 446 + struct xbitmap crossed; 447 + 445 448 /* Bitmap of other OWN_AG metadata blocks. */ 446 449 struct xbitmap agmetablocks; 447 450 448 451 /* Bitmap of free space. */ 449 452 struct xbitmap *freesp; 453 + 454 + /* rmapbt cursor for finding crosslinked blocks */ 455 + struct xfs_btree_cur *rmap_cur; 450 456 451 457 struct xfs_scrub *sc; 452 458 }; ··· 483 477 return xbitmap_set_btcur_path(&ra->agmetablocks, cur); 484 478 } 485 479 480 + /* Strike out the blocks that are cross-linked according to the rmapbt. */ 481 + STATIC int 482 + xrep_agfl_check_extent( 483 + struct xrep_agfl *ra, 484 + uint64_t start, 485 + uint64_t len) 486 + { 487 + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(ra->sc->mp, start); 488 + xfs_agblock_t last_agbno = agbno + len - 1; 489 + int error; 490 + 491 + ASSERT(XFS_FSB_TO_AGNO(ra->sc->mp, start) == ra->sc->sa.pag->pag_agno); 492 + 493 + while (agbno <= last_agbno) { 494 + bool other_owners; 495 + 496 + error = xfs_rmap_has_other_keys(ra->rmap_cur, agbno, 1, 497 + &XFS_RMAP_OINFO_AG, &other_owners); 498 + if (error) 499 + return error; 500 + 501 + if (other_owners) { 502 + error = xbitmap_set(&ra->crossed, agbno, 1); 503 + if (error) 504 + return error; 505 + } 506 + 507 + if (xchk_should_terminate(ra->sc, &error)) 508 + return error; 509 + agbno++; 510 + } 511 + 512 + return 0; 513 + } 514 + 486 515 /* 487 516 * Map out all the non-AGFL OWN_AG space in this AG so that we can deduce 488 517 * which blocks belong to the AGFL. ··· 537 496 struct xrep_agfl ra; 538 497 struct xfs_mount *mp = sc->mp; 539 498 struct xfs_btree_cur *cur; 499 + struct xbitmap_range *br, *n; 540 500 int error; 541 501 542 502 ra.sc = sc; 543 503 ra.freesp = agfl_extents; 544 504 xbitmap_init(&ra.agmetablocks); 505 + xbitmap_init(&ra.crossed); 545 506 546 507 /* Find all space used by the free space btrees & rmapbt. */ 547 508 cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); 548 509 error = xfs_rmap_query_all(cur, xrep_agfl_walk_rmap, &ra); 549 - if (error) 550 - goto err; 551 510 xfs_btree_del_cursor(cur, error); 511 + if (error) 512 + goto out_bmp; 552 513 553 514 /* Find all blocks currently being used by the bnobt. */ 554 515 cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, 555 516 sc->sa.pag, XFS_BTNUM_BNO); 556 517 error = xbitmap_set_btblocks(&ra.agmetablocks, cur); 557 - if (error) 558 - goto err; 559 518 xfs_btree_del_cursor(cur, error); 519 + if (error) 520 + goto out_bmp; 560 521 561 522 /* Find all blocks currently being used by the cntbt. */ 562 523 cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, 563 524 sc->sa.pag, XFS_BTNUM_CNT); 564 525 error = xbitmap_set_btblocks(&ra.agmetablocks, cur); 565 - if (error) 566 - goto err; 567 - 568 526 xfs_btree_del_cursor(cur, error); 527 + if (error) 528 + goto out_bmp; 569 529 570 530 /* 571 531 * Drop the freesp meta blocks that are in use by btrees. 572 532 * The remaining blocks /should/ be AGFL blocks. 573 533 */ 574 534 error = xbitmap_disunion(agfl_extents, &ra.agmetablocks); 575 - xbitmap_destroy(&ra.agmetablocks); 576 535 if (error) 577 - return error; 536 + goto out_bmp; 537 + 538 + /* Strike out the blocks that are cross-linked. */ 539 + ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); 540 + for_each_xbitmap_extent(br, n, agfl_extents) { 541 + error = xrep_agfl_check_extent(&ra, br->start, br->len); 542 + if (error) 543 + break; 544 + } 545 + xfs_btree_del_cursor(ra.rmap_cur, error); 546 + if (error) 547 + goto out_bmp; 548 + error = xbitmap_disunion(agfl_extents, &ra.crossed); 549 + if (error) 550 + goto out_bmp; 578 551 579 552 /* 580 553 * Calculate the new AGFL size. If we found more blocks than fit in ··· 596 541 */ 597 542 *flcount = min_t(uint64_t, xbitmap_hweight(agfl_extents), 598 543 xfs_agfl_size(mp)); 599 - return 0; 600 544 601 - err: 545 + out_bmp: 546 + xbitmap_destroy(&ra.crossed); 602 547 xbitmap_destroy(&ra.agmetablocks); 603 - xfs_btree_del_cursor(cur, error); 604 548 return error; 605 549 } 606 550 ··· 685 631 if (br->len) 686 632 break; 687 633 list_del(&br->list); 688 - kmem_free(br); 634 + kfree(br); 689 635 } 690 636 691 637 /* Write new AGFL to disk. */ ··· 751 697 * freespace overflow to the freespace btrees. 752 698 */ 753 699 sc->sa.agf_bp = agf_bp; 754 - sc->sa.agfl_bp = agfl_bp; 755 700 error = xrep_roll_ag_trans(sc); 756 701 if (error) 757 702 goto err;
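The AGFL repair in this hunk is built on set subtraction over extent bitmaps: collect the OWN_AG btree blocks, `xbitmap_disunion()` them out of the candidate set, then subtract the cross-linked blocks too. A minimal userspace sketch of that subtraction idea over sorted, non-overlapping ranges — not the kernel's `xbitmap` implementation, and all names here are illustrative:

```c
#include <assert.h>
#include <stdint.h>
#include <stddef.h>

/* Illustrative stand-in for the kernel's xbitmap extent ranges. */
struct range {
	uint64_t start;
	uint64_t len;
};

/*
 * Remove every block in "sub" from "set", writing the surviving pieces to
 * "out" (which must be large enough).  Returns the number of ranges
 * written.  Both inputs must be sorted and non-overlapping.
 */
static size_t range_disunion(const struct range *set, size_t nset,
			     const struct range *sub, size_t nsub,
			     struct range *out)
{
	size_t n = 0;

	for (size_t i = 0; i < nset; i++) {
		uint64_t lo = set[i].start;
		uint64_t hi = set[i].start + set[i].len;

		for (size_t j = 0; j < nsub && lo < hi; j++) {
			uint64_t slo = sub[j].start;
			uint64_t shi = sub[j].start + sub[j].len;

			if (shi <= lo || slo >= hi)
				continue;	/* no overlap */
			if (slo > lo)		/* keep the left piece */
				out[n++] = (struct range){ lo, slo - lo };
			if (shi > lo)		/* resume past the overlap */
				lo = shi;
		}
		if (lo < hi)			/* keep whatever remains */
			out[n++] = (struct range){ lo, hi - lo };
	}
	return n;
}
```

The repair code does this twice: once against the btree-block bitmap and once against the `crossed` bitmap, so only single-owner, non-btree OWN_AG blocks survive as AGFL candidates.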
+5 -6
fs/xfs/scrub/attr.c
··· 49 49 if (ab) { 50 50 if (sz <= ab->sz) 51 51 return 0; 52 - kmem_free(ab); 52 + kvfree(ab); 53 53 sc->buf = NULL; 54 54 } 55 55 ··· 79 79 * without the inode lock held, which means we can sleep. 80 80 */ 81 81 if (sc->flags & XCHK_TRY_HARDER) { 82 - error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, GFP_KERNEL); 82 + error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, 83 + XCHK_GFP_FLAGS); 83 84 if (error) 84 85 return error; 85 86 } ··· 139 138 * doesn't work, we overload the seen_enough variable to convey 140 139 * the error message back to the main scrub function. 141 140 */ 142 - error = xchk_setup_xattr_buf(sx->sc, valuelen, 143 - GFP_KERNEL | __GFP_RETRY_MAYFAIL); 141 + error = xchk_setup_xattr_buf(sx->sc, valuelen, XCHK_GFP_FLAGS); 144 142 if (error == -ENOMEM) 145 143 error = -EDEADLOCK; 146 144 if (error) { ··· 324 324 return 0; 325 325 326 326 /* Allocate memory for block usage checking. */ 327 - error = xchk_setup_xattr_buf(ds->sc, 0, 328 - GFP_KERNEL | __GFP_RETRY_MAYFAIL); 327 + error = xchk_setup_xattr_buf(ds->sc, 0, XCHK_GFP_FLAGS); 329 328 if (error == -ENOMEM) 330 329 return -EDEADLOCK; 331 330 if (error)
+6 -5
fs/xfs/scrub/bitmap.c
··· 10 10 #include "xfs_trans_resv.h" 11 11 #include "xfs_mount.h" 12 12 #include "xfs_btree.h" 13 + #include "scrub/scrub.h" 13 14 #include "scrub/bitmap.h" 14 15 15 16 /* ··· 26 25 { 27 26 struct xbitmap_range *bmr; 28 27 29 - bmr = kmem_alloc(sizeof(struct xbitmap_range), KM_MAYFAIL); 28 + bmr = kmalloc(sizeof(struct xbitmap_range), XCHK_GFP_FLAGS); 30 29 if (!bmr) 31 30 return -ENOMEM; 32 31 ··· 48 47 49 48 for_each_xbitmap_extent(bmr, n, bitmap) { 50 49 list_del(&bmr->list); 51 - kmem_free(bmr); 50 + kfree(bmr); 52 51 } 53 52 } 54 53 ··· 175 174 /* Total overlap, just delete ex. */ 176 175 lp = lp->next; 177 176 list_del(&br->list); 178 - kmem_free(br); 177 + kfree(br); 179 178 break; 180 179 case 0: 181 180 /* 182 181 * Deleting from the middle: add the new right extent 183 182 * and then shrink the left extent. 184 183 */ 185 - new_br = kmem_alloc(sizeof(struct xbitmap_range), 186 - KM_MAYFAIL); 184 + new_br = kmalloc(sizeof(struct xbitmap_range), 185 + XCHK_GFP_FLAGS); 187 186 if (!new_br) { 188 187 error = -ENOMEM; 189 188 goto out;
+119 -28
fs/xfs/scrub/bmap.c
··· 90 90 91 91 struct xchk_bmap_info { 92 92 struct xfs_scrub *sc; 93 + struct xfs_iext_cursor icur; 93 94 xfs_fileoff_t lastoff; 94 95 bool is_rt; 95 96 bool is_shared; ··· 145 144 xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, 146 145 irec->br_startoff); 147 146 return has_rmap; 147 + } 148 + 149 + static inline bool 150 + xchk_bmap_has_prev( 151 + struct xchk_bmap_info *info, 152 + struct xfs_bmbt_irec *irec) 153 + { 154 + struct xfs_bmbt_irec got; 155 + struct xfs_ifork *ifp; 156 + 157 + ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); 158 + 159 + if (!xfs_iext_peek_prev_extent(ifp, &info->icur, &got)) 160 + return false; 161 + if (got.br_startoff + got.br_blockcount != irec->br_startoff) 162 + return false; 163 + if (got.br_startblock + got.br_blockcount != irec->br_startblock) 164 + return false; 165 + if (got.br_state != irec->br_state) 166 + return false; 167 + return true; 168 + } 169 + 170 + static inline bool 171 + xchk_bmap_has_next( 172 + struct xchk_bmap_info *info, 173 + struct xfs_bmbt_irec *irec) 174 + { 175 + struct xfs_bmbt_irec got; 176 + struct xfs_ifork *ifp; 177 + 178 + ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); 179 + 180 + if (!xfs_iext_peek_next_extent(ifp, &info->icur, &got)) 181 + return false; 182 + if (irec->br_startoff + irec->br_blockcount != got.br_startoff) 183 + return false; 184 + if (irec->br_startblock + irec->br_blockcount != got.br_startblock) 185 + return false; 186 + if (got.br_state != irec->br_state) 187 + return false; 188 + return true; 148 189 } 149 190 150 191 /* Make sure that we have rmapbt records for this extent. */ ··· 257 214 if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK) 258 215 xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, 259 216 irec->br_startoff); 217 + 218 + /* 219 + * If the rmap starts before this bmbt record, make sure there's a bmbt 220 + * record for the previous offset that is contiguous with this mapping. 221 + * Skip this for CoW fork extents because the refcount btree (and not 222 + * the inode) is the ondisk owner for those extents. 223 + */ 224 + if (info->whichfork != XFS_COW_FORK && rmap.rm_startblock < agbno && 225 + !xchk_bmap_has_prev(info, irec)) { 226 + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, 227 + irec->br_startoff); 228 + return; 229 + } 230 + 231 + /* 232 + * If the rmap ends after this bmbt record, make sure there's a bmbt 233 + * record for the next offset that is contiguous with this mapping. 234 + * Skip this for CoW fork extents because the refcount btree (and not 235 + * the inode) is the ondisk owner for those extents. 236 + */ 237 + rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; 238 + if (info->whichfork != XFS_COW_FORK && 239 + rmap_end > agbno + irec->br_blockcount && 240 + !xchk_bmap_has_next(info, irec)) { 241 + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, 242 + irec->br_startoff); 243 + return; 244 + } 260 245 } 261 246 262 247 /* Cross-reference a single rtdev extent record. */ ··· 335 264 case XFS_COW_FORK: 336 265 xchk_xref_is_cow_staging(info->sc, agbno, 337 266 irec->br_blockcount); 267 + xchk_xref_is_not_shared(info->sc, agbno, 268 + irec->br_blockcount); 338 269 break; 339 270 } 340 271 ··· 370 297 } 371 298 372 299 /* Scrub a single extent record. */ 373 - STATIC int 300 + STATIC void 374 301 xchk_bmap_iextent( 375 302 struct xfs_inode *ip, 376 303 struct xchk_bmap_info *info, 377 304 struct xfs_bmbt_irec *irec) 378 305 { 379 306 struct xfs_mount *mp = info->sc->mp; 380 - int error = 0; 381 307 382 308 /* 383 309 * Check for out-of-order extents. This record could have come ··· 394 322 395 323 /* There should never be a "hole" extent in either extent list. */ 396 324 if (irec->br_startblock == HOLESTARTBLOCK) 397 - xchk_fblock_set_corrupt(info->sc, info->whichfork, 398 - irec->br_startoff); 399 - 400 - /* 401 - * Check for delalloc extents. We never iterate the ones in the 402 - * in-core extent scan, and we should never see these in the bmbt. 403 - */ 404 - if (isnullstartblock(irec->br_startblock)) 405 325 xchk_fblock_set_corrupt(info->sc, info->whichfork, 406 326 irec->br_startoff); 407 327 ··· 417 353 irec->br_startoff); 418 354 419 355 if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 420 - return 0; 356 + return; 421 357 422 358 if (info->is_rt) 423 359 xchk_bmap_rt_iextent_xref(ip, info, irec); 424 360 else 425 361 xchk_bmap_iextent_xref(ip, info, irec); 426 - 427 - info->lastoff = irec->br_startoff + irec->br_blockcount; 428 - return error; 429 362 } 430 363 431 364 /* Scrub a bmbt record. */ ··· 660 599 661 600 for_each_perag(sc->mp, agno, pag) { 662 601 error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag); 663 - if (error) 664 - break; 665 - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 666 - break; 602 + if (error || 603 + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { 604 + xfs_perag_put(pag); 605 + return error; 606 + } 667 607 } 668 - if (pag) 669 - xfs_perag_put(pag); 670 - return error; 608 + 609 + return 0; 610 + } 611 + 612 + /* Scrub a delalloc reservation from the incore extent map tree. */ 613 + STATIC void 614 + xchk_bmap_iextent_delalloc( 615 + struct xfs_inode *ip, 616 + struct xchk_bmap_info *info, 617 + struct xfs_bmbt_irec *irec) 618 + { 619 + struct xfs_mount *mp = info->sc->mp; 620 + 621 + /* 622 + * Check for out-of-order extents. This record could have come 623 + * from the incore list, for which there is no ordering check. 624 + */ 625 + if (irec->br_startoff < info->lastoff) 626 + xchk_fblock_set_corrupt(info->sc, info->whichfork, 627 + irec->br_startoff); 628 + 629 + if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount)) 630 + xchk_fblock_set_corrupt(info->sc, info->whichfork, 631 + irec->br_startoff); 632 + 633 + /* Make sure the extent points to a valid place. */ 634 + if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN) 635 + xchk_fblock_set_corrupt(info->sc, info->whichfork, 636 + irec->br_startoff); 671 637 } 672 638 673 639 /* ··· 714 626 struct xfs_inode *ip = sc->ip; 715 627 struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); 716 628 xfs_fileoff_t endoff; 717 - struct xfs_iext_cursor icur; 718 629 int error = 0; 719 630 720 631 /* Non-existent forks can be ignored. */ ··· 748 661 case XFS_DINODE_FMT_DEV: 749 662 case XFS_DINODE_FMT_LOCAL: 750 663 /* No mappings to check. */ 664 + if (whichfork == XFS_COW_FORK) 665 + xchk_fblock_set_corrupt(sc, whichfork, 0); 751 666 goto out; 752 667 case XFS_DINODE_FMT_EXTENTS: 753 668 break; ··· 779 690 /* Scrub extent records. */ 780 691 info.lastoff = 0; 781 692 ifp = xfs_ifork_ptr(ip, whichfork); 782 - for_each_xfs_iext(ifp, &icur, &irec) { 693 + for_each_xfs_iext(ifp, &info.icur, &irec) { 783 694 if (xchk_should_terminate(sc, &error) || 784 695 (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) 785 696 goto out; 786 - if (isnullstartblock(irec.br_startblock)) 787 - continue; 697 + 788 698 if (irec.br_startoff >= endoff) { 789 699 xchk_fblock_set_corrupt(sc, whichfork, 790 700 irec.br_startoff); 791 701 goto out; 792 702 } 793 - error = xchk_bmap_iextent(ip, &info, &irec); 794 - if (error) 795 - goto out; 703 + 704 + if (isnullstartblock(irec.br_startblock)) 705 + xchk_bmap_iextent_delalloc(ip, &info, &irec); 706 + else 707 + xchk_bmap_iextent(ip, &info, &irec); 708 + info.lastoff = irec.br_startoff + irec.br_blockcount; 796 709 } 797 710 798 711 error = xchk_bmap_check_rmaps(sc, whichfork);
+8 -6
fs/xfs/scrub/btree.c
··· 408 408 struct xfs_buf *bp) 409 409 { 410 410 struct xfs_btree_cur *cur = bs->cur; 411 - struct check_owner *co; 412 411 413 412 /* 414 413 * In theory, xfs_btree_get_block should only give us a null buffer ··· 430 431 * later scanning. 431 432 */ 432 433 if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) { 433 - co = kmem_alloc(sizeof(struct check_owner), 434 - KM_MAYFAIL); 434 + struct check_owner *co; 435 + 436 + co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS); 435 437 if (!co) 436 438 return -ENOMEM; 439 + 440 + INIT_LIST_HEAD(&co->list); 437 441 co->level = level; 438 442 co->daddr = xfs_buf_daddr(bp); 439 443 list_add_tail(&co->list, &bs->to_check); ··· 651 649 xchk_btree_set_corrupt(sc, cur, 0); 652 650 return 0; 653 651 } 654 - bs = kmem_zalloc(cur_sz, KM_NOFS | KM_MAYFAIL); 652 + bs = kzalloc(cur_sz, XCHK_GFP_FLAGS); 655 653 if (!bs) 656 654 return -ENOMEM; 657 655 bs->cur = cur; ··· 742 740 error = xchk_btree_check_block_owner(bs, co->level, 743 741 co->daddr); 744 742 list_del(&co->list); 745 - kmem_free(co); 743 + kfree(co); 746 744 } 747 - kmem_free(bs); 745 + kfree(bs); 748 746 749 747 return error; 750 748 }
+34 -14
fs/xfs/scrub/common.c
··· 424 424 if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF)) 425 425 return error; 426 426 427 - error = xfs_alloc_read_agfl(sa->pag, sc->tp, &sa->agfl_bp); 428 - if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL)) 429 - return error; 430 - 431 427 return 0; 432 428 } ··· 511 515 struct xchk_ag *sa) 512 516 { 513 517 xchk_ag_btcur_free(sa); 514 - if (sa->agfl_bp) { 515 - xfs_trans_brelse(sc->tp, sa->agfl_bp); 516 - sa->agfl_bp = NULL; 517 - } 518 518 if (sa->agf_bp) { 519 519 xfs_trans_brelse(sc->tp, sa->agf_bp); 520 520 sa->agf_bp = NULL; ··· 781 789 trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa); 782 790 } 783 791 792 + static inline int 793 + xchk_metadata_inode_subtype( 794 + struct xfs_scrub *sc, 795 + unsigned int scrub_type) 796 + { 797 + __u32 smtype = sc->sm->sm_type; 798 + int error; 799 + 800 + sc->sm->sm_type = scrub_type; 801 + 802 + switch (scrub_type) { 803 + case XFS_SCRUB_TYPE_INODE: 804 + error = xchk_inode(sc); 805 + break; 806 + case XFS_SCRUB_TYPE_BMBTD: 807 + error = xchk_bmap_data(sc); 808 + break; 809 + default: 810 + ASSERT(0); 811 + error = -EFSCORRUPTED; 812 + break; 813 + } 814 + 815 + sc->sm->sm_type = smtype; 816 + return error; 817 + } 818 + 784 819 /* 785 820 * Scrub the attr/data forks of a metadata inode. The metadata inode must be 786 821 * pointed to by sc->ip and the ILOCK must be held. ··· 816 797 xchk_metadata_inode_forks( 817 798 struct xfs_scrub *sc) 818 799 { 819 - __u32 smtype; 820 800 bool shared; 821 801 int error; 822 802 823 803 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 824 804 return 0; 805 + 806 + /* Check the inode record. */ 807 + error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE); 808 + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) 809 + return error; 825 810 826 811 /* Metadata inodes don't live on the rt device. */ 827 812 if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) { ··· 846 823 } 847 824 848 825 /* Invoke the data fork scrubber. */ 849 - smtype = sc->sm->sm_type; 850 - sc->sm->sm_type = XFS_SCRUB_TYPE_BMBTD; 851 - error = xchk_bmap_data(sc); 852 - sc->sm->sm_type = smtype; 826 + error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD); 853 827 if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) 854 828 return error; 855 829 ··· 861 841 xchk_ino_set_corrupt(sc, sc->ip->i_ino); 862 842 } 863 843 864 - return error; 844 + return 0; 865 845 } 866 846 867 847 /*
+1 -1
fs/xfs/scrub/common.h
··· 25 25 26 26 if (fatal_signal_pending(current)) { 27 27 if (*error == 0) 28 - *error = -EAGAIN; 28 + *error = -EINTR; 29 29 return true; 30 30 } 31 31 return false;
+2 -2
fs/xfs/scrub/dabtree.c
··· 486 486 return 0; 487 487 488 488 /* Set up initial da state. */ 489 - ds = kmem_zalloc(sizeof(struct xchk_da_btree), KM_NOFS | KM_MAYFAIL); 489 + ds = kzalloc(sizeof(struct xchk_da_btree), XCHK_GFP_FLAGS); 490 490 if (!ds) 491 491 return -ENOMEM; 492 492 ds->dargs.dp = sc->ip; ··· 591 591 592 592 out_state: 593 593 xfs_da_state_free(ds->state); 594 - kmem_free(ds); 594 + kfree(ds); 595 595 return error; 596 596 }
+6 -4
fs/xfs/scrub/dir.c
··· 666 666 struct xfs_scrub *sc) 667 667 { 668 668 struct xfs_bmbt_irec got; 669 - struct xfs_da_args args; 669 + struct xfs_da_args args = { 670 + .dp = sc->ip, 671 + .whichfork = XFS_DATA_FORK, 672 + .geo = sc->mp->m_dir_geo, 673 + .trans = sc->tp, 674 + }; 670 675 struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); 671 676 struct xfs_mount *mp = sc->mp; 672 677 xfs_fileoff_t leaf_lblk; ··· 694 689 free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET); 695 690 696 691 /* Is this a block dir? */ 697 - args.dp = sc->ip; 698 - args.geo = mp->m_dir_geo; 699 - args.trans = sc->tp; 700 692 error = xfs_dir2_isblock(&args, &is_block); 701 693 if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) 702 694 goto out;
+105 -4
fs/xfs/scrub/fscounters.c
··· 14 14 #include "xfs_health.h" 15 15 #include "xfs_btree.h" 16 16 #include "xfs_ag.h" 17 + #include "xfs_rtalloc.h" 18 + #include "xfs_inode.h" 17 19 #include "scrub/scrub.h" 18 20 #include "scrub/common.h" 19 21 #include "scrub/trace.h" ··· 44 42 * after this operation and use the difference in counter values to guess at 45 43 * our tolerance for mismatch between expected and actual counter values. 46 44 */ 45 + 46 + struct xchk_fscounters { 47 + struct xfs_scrub *sc; 48 + uint64_t icount; 49 + uint64_t ifree; 50 + uint64_t fdblocks; 51 + uint64_t frextents; 52 + unsigned long long icount_min; 53 + unsigned long long icount_max; 54 + }; 47 55 48 56 /* 49 57 * Since the expected value computation is lockless but only browses incore ··· 128 116 struct xchk_fscounters *fsc; 129 117 int error; 130 118 131 - sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0); 119 + sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS); 132 120 if (!sc->buf) 133 121 return -ENOMEM; 134 122 fsc = sc->buf; 123 + fsc->sc = sc; 135 124 136 125 xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max); 137 126 ··· 150 137 151 138 return xchk_trans_alloc(sc, 0); 152 139 } 140 + 141 + /* 142 + * Part 1: Collecting filesystem summary counts. For each AG, we add its 143 + * summary counts (total inodes, free inodes, free data blocks) to an incore 144 + * copy of the overall filesystem summary counts. 145 + * 146 + * To avoid false corruption reports in part 2, any failure in this part must 147 + * set the INCOMPLETE flag even when a negative errno is returned. This care 148 + * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, 149 + * ECANCELED) that are absorbed into a scrub state flag update by 150 + * xchk_*_process_error. 151 + */ 153 152 154 153 /* Count free space btree blocks manually for pre-lazysbcount filesystems. */ 155 154 static int ··· 250 225 } 251 226 if (pag) 252 227 xfs_perag_put(pag); 253 - if (error) 228 + if (error) { 229 + xchk_set_incomplete(sc); 254 230 return error; 231 + } 255 232 256 233 /* 257 234 * The global incore space reservation is taken from the incore ··· 293 266 294 267 return 0; 295 268 } 269 + 270 + #ifdef CONFIG_XFS_RT 271 + STATIC int 272 + xchk_fscount_add_frextent( 273 + struct xfs_mount *mp, 274 + struct xfs_trans *tp, 275 + const struct xfs_rtalloc_rec *rec, 276 + void *priv) 277 + { 278 + struct xchk_fscounters *fsc = priv; 279 + int error = 0; 280 + 281 + fsc->frextents += rec->ar_extcount; 282 + 283 + xchk_should_terminate(fsc->sc, &error); 284 + return error; 285 + } 286 + 287 + /* Calculate the number of free realtime extents from the realtime bitmap. */ 288 + STATIC int 289 + xchk_fscount_count_frextents( 290 + struct xfs_scrub *sc, 291 + struct xchk_fscounters *fsc) 292 + { 293 + struct xfs_mount *mp = sc->mp; 294 + int error; 295 + 296 + fsc->frextents = 0; 297 + if (!xfs_has_realtime(mp)) 298 + return 0; 299 + 300 + xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); 301 + error = xfs_rtalloc_query_all(sc->mp, sc->tp, 302 + xchk_fscount_add_frextent, fsc); 303 + if (error) { 304 + xchk_set_incomplete(sc); 305 + goto out_unlock; 306 + } 307 + 308 + out_unlock: 309 + xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); 310 + return error; 311 + } 312 + #else 313 + STATIC int 314 + xchk_fscount_count_frextents( 315 + struct xfs_scrub *sc, 316 + struct xchk_fscounters *fsc) 317 + { 318 + fsc->frextents = 0; 319 + return 0; 320 + } 321 + #endif /* CONFIG_XFS_RT */ 322 + 323 + /* 324 + * Part 2: Comparing filesystem summary counters. All we have to do here is 325 + * sum the percpu counters and compare them to what we've observed. 326 + */ 296 327 297 328 /* 298 329 * Is the @counter reasonably close to the @expected value? ··· 418 333 { 419 334 struct xfs_mount *mp = sc->mp; 420 335 struct xchk_fscounters *fsc = sc->buf; 421 - int64_t icount, ifree, fdblocks; 336 + int64_t icount, ifree, fdblocks, frextents; 422 337 int error; 423 338 424 339 /* Snapshot the percpu counters. */ 425 340 icount = percpu_counter_sum(&mp->m_icount); 426 341 ifree = percpu_counter_sum(&mp->m_ifree); 427 342 fdblocks = percpu_counter_sum(&mp->m_fdblocks); 343 + frextents = percpu_counter_sum(&mp->m_frextents); 428 344 429 345 /* No negative values, please! */ 430 - if (icount < 0 || ifree < 0 || fdblocks < 0) 346 + if (icount < 0 || ifree < 0 || fdblocks < 0 || frextents < 0) 431 347 xchk_set_corrupt(sc); 432 348 433 349 /* See if icount is obviously wrong. */ ··· 437 351 438 352 /* See if fdblocks is obviously wrong. */ 439 353 if (fdblocks > mp->m_sb.sb_dblocks) 354 + xchk_set_corrupt(sc); 355 + 356 + /* See if frextents is obviously wrong. */ 357 + if (frextents > mp->m_sb.sb_rextents) 440 358 xchk_set_corrupt(sc); 441 359 442 360 /* ··· 457 367 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) 458 368 return 0; 459 369 370 + /* Count the free extents counter for rt volumes. */ 371 + error = xchk_fscount_count_frextents(sc, fsc); 372 + if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) 373 + return error; 374 + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) 375 + return 0; 376 + 460 377 /* Compare the in-core counters with whatever we counted. */ 461 378 if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount)) 462 379 xchk_set_corrupt(sc); ··· 473 376 474 377 if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, 475 378 fsc->fdblocks)) 379 + xchk_set_corrupt(sc); 380 + 381 + if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, 382 + fsc->frextents)) 476 383 xchk_set_corrupt(sc); 477 384 478 385 return 0;
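The new `xchk_fscount_count_frextents()` in the fscounters hunk totals free realtime extents by walking the rt bitmap with a callback that accumulates into a private counter. A tiny userspace sketch of that accumulate-via-callback pattern, with names invented for illustration (the real iterator is `xfs_rtalloc_query_all()` and carries mount/transaction context):

```c
#include <assert.h>
#include <stdint.h>
#include <stddef.h>

/* Illustrative free-extent record, like xfs_rtalloc_rec's ar_extcount. */
struct rt_rec {
	uint64_t extcount;
};

/* Callback shape modeled loosely on the kernel's query-all walkers. */
typedef int (*rt_walk_fn)(const struct rt_rec *rec, void *priv);

/* Walk every record, stopping early if the callback reports an error. */
static int rt_query_all(const struct rt_rec *recs, size_t n,
			rt_walk_fn fn, void *priv)
{
	for (size_t i = 0; i < n; i++) {
		int error = fn(&recs[i], priv);

		if (error)
			return error;
	}
	return 0;
}

/* Accumulate extent counts, as xchk_fscount_add_frextent() does. */
static int add_frextent(const struct rt_rec *rec, void *priv)
{
	*(uint64_t *)priv += rec->extcount;
	return 0;
}
```

The kernel version additionally checks `xchk_should_terminate()` inside the callback so a fatal signal can abort the walk partway through.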
+1 -1
fs/xfs/scrub/inode.c
··· 365 365 * pagecache can't cache all the blocks in this file due to 366 366 * overly large offsets, flag the inode for admin review. 367 367 */ 368 - if (isize >= mp->m_super->s_maxbytes) 368 + if (isize > mp->m_super->s_maxbytes) 369 369 xchk_ino_set_warning(sc, ino); 370 370 371 371 /* di_nblocks */
+5 -3
fs/xfs/scrub/quota.c
··· 14 14 #include "xfs_inode.h" 15 15 #include "xfs_quota.h" 16 16 #include "xfs_qm.h" 17 + #include "xfs_bmap.h" 17 18 #include "scrub/scrub.h" 18 19 #include "scrub/common.h" 19 20 ··· 85 84 int error = 0; 86 85 87 86 if (xchk_should_terminate(sc, &error)) 88 - return -ECANCELED; 87 + return error; 89 88 90 89 /* 91 90 * Except for the root dquot, the actual dquot we got must either have ··· 190 189 for_each_xfs_iext(ifp, &icur, &irec) { 191 190 if (xchk_should_terminate(sc, &error)) 192 191 break; 192 + 193 193 /* 194 - * delalloc extents or blocks mapped above the highest 194 + * delalloc/unwritten extents or blocks mapped above the highest 195 195 * quota id shouldn't happen. 196 196 */ 197 - if (isnullstartblock(irec.br_startblock) || 197 + if (!xfs_bmap_is_written_extent(&irec) || 198 198 irec.br_startoff > max_dqid_off || 199 199 irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) { 200 200 xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
+6 -6
fs/xfs/scrub/refcount.c
··· 127 127 * is healthy each rmap_irec we see will be in agbno order 128 128 * so we don't need insertion sort here. 129 129 */ 130 - frag = kmem_alloc(sizeof(struct xchk_refcnt_frag), 131 - KM_MAYFAIL); 130 + frag = kmalloc(sizeof(struct xchk_refcnt_frag), 131 + XCHK_GFP_FLAGS); 132 132 if (!frag) 133 133 return -ENOMEM; 134 134 memcpy(&frag->rm, rec, sizeof(frag->rm)); ··· 215 215 continue; 216 216 } 217 217 list_del(&frag->list); 218 - kmem_free(frag); 218 + kfree(frag); 219 219 nr++; 220 220 } 221 221 ··· 257 257 /* Delete fragments and work list. */ 258 258 list_for_each_entry_safe(frag, n, &worklist, list) { 259 259 list_del(&frag->list); 260 - kmem_free(frag); 260 + kfree(frag); 261 261 } 262 262 list_for_each_entry_safe(frag, n, &refchk->fragments, list) { 263 263 list_del(&frag->list); 264 - kmem_free(frag); 264 + kfree(frag); 265 265 } 266 266 } 267 267 ··· 306 306 out_free: 307 307 list_for_each_entry_safe(frag, n, &refchk.fragments, list) { 308 308 list_del(&frag->list); 309 - kmem_free(frag); 309 + kfree(frag); 310 310 } 311 311 } 312 312
+34 -17
fs/xfs/scrub/repair.c
··· 61 61 sc->flags |= XREP_ALREADY_FIXED; 62 62 return -EAGAIN; 63 63 case -EDEADLOCK: 64 - case -EAGAIN: 65 64 /* Tell the caller to try again having grabbed all the locks. */ 66 65 if (!(sc->flags & XCHK_TRY_HARDER)) { 67 66 sc->flags |= XCHK_TRY_HARDER; ··· 69 70 /* 70 71 * We tried harder but still couldn't grab all the resources 71 72 * we needed to fix it. The corruption has not been fixed, 72 - * so report back to userspace. 73 + * so exit to userspace with the scan's output flags unchanged. 73 74 */ 74 - return -EFSCORRUPTED; 75 + return 0; 75 76 default: 77 + /* 78 + * EAGAIN tells the caller to re-scrub, so we cannot return 79 + * that here. 80 + */ 81 + ASSERT(error != -EAGAIN); 76 82 return error; 77 83 } 78 84 } ··· 125 121 { 126 122 int error; 127 123 128 - /* Keep the AG header buffers locked so we can keep going. */ 129 - if (sc->sa.agi_bp) 124 + /* 125 + * Keep the AG header buffers locked while we roll the transaction. 126 + * Ensure that both AG buffers are dirty and held when we roll the 127 + * transaction so that they move forward in the log without losing the 128 + * bli (and hence the bli type) when the transaction commits. 129 + * 130 + * Normal code would never hold clean buffers across a roll, but repair 131 + * needs both buffers to maintain a total lock on the AG. 132 + */ 133 + if (sc->sa.agi_bp) { 134 + xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM); 130 135 xfs_trans_bhold(sc->tp, sc->sa.agi_bp); 131 - if (sc->sa.agf_bp) 136 + } 137 + 138 + if (sc->sa.agf_bp) { 139 + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM); 132 140 xfs_trans_bhold(sc->tp, sc->sa.agf_bp); 133 - if (sc->sa.agfl_bp) 134 - xfs_trans_bhold(sc->tp, sc->sa.agfl_bp); 141 + } 135 142 136 143 /* 137 - * Roll the transaction. We still own the buffer and the buffer lock 138 - * regardless of whether or not the roll succeeds. If the roll fails, 139 - * the buffers will be released during teardown on our way out of the 140 - * kernel. If it succeeds, we join them to the new transaction and 141 - * move on. 144 + * Roll the transaction. We still hold the AG header buffers locked 145 + * regardless of whether or not that succeeds. On failure, the buffers 146 + * will be released during teardown on our way out of the kernel. If 147 + * successful, join the buffers to the new transaction and move on. 142 148 */ 143 149 error = xfs_trans_roll(&sc->tp); 144 150 if (error) 145 151 return error; 146 152 147 - /* Join AG headers to the new transaction. */ 153 + /* Join the AG headers to the new transaction. */ 148 154 if (sc->sa.agi_bp) 149 155 xfs_trans_bjoin(sc->tp, sc->sa.agi_bp); 150 156 if (sc->sa.agf_bp) 151 157 xfs_trans_bjoin(sc->tp, sc->sa.agf_bp); 152 - if (sc->sa.agfl_bp) 153 - xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp); 154 158 155 159 return 0; 156 160 } ··· 510 498 struct xfs_scrub *sc, 511 499 xfs_agblock_t agbno) 512 500 { 501 + struct xfs_buf *agfl_bp; 513 502 int error; 514 503 515 504 /* Make sure there's space on the freelist. */ ··· 529 516 return error; 530 517 531 518 /* Put the block on the AGFL. */ 519 + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); 520 + if (error) 521 + return error; 522 + 532 523 error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp, 533 - sc->sa.agfl_bp, agbno, 0); 524 + agfl_bp, agbno, 0); 534 525 if (error) 535 526 return error; 536 527 xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
+3 -3
fs/xfs/scrub/scrub.c
··· 174 174 if (sc->flags & XCHK_REAPING_DISABLED) 175 175 xchk_start_reaping(sc); 176 176 if (sc->buf) { 177 - kmem_free(sc->buf); 177 + kvfree(sc->buf); 178 178 sc->buf = NULL; 179 179 } 180 180 return error; ··· 467 467 xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB, 468 468 "EXPERIMENTAL online scrub feature in use. Use at your own risk!"); 469 469 470 - sc = kmem_zalloc(sizeof(struct xfs_scrub), KM_NOFS | KM_MAYFAIL); 470 + sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS); 471 471 if (!sc) { 472 472 error = -ENOMEM; 473 473 goto out; ··· 557 557 out_teardown: 558 558 error = xchk_teardown(sc, error); 559 559 out_sc: 560 - kmem_free(sc); 560 + kfree(sc); 561 561 out: 562 562 trace_xchk_done(XFS_I(file_inode(file)), sm, error); 563 563 if (error == -EFSCORRUPTED || error == -EFSBADCRC) {
+9 -9
fs/xfs/scrub/scrub.h
··· 8 8 9 9 struct xfs_scrub; 10 10 11 + /* 12 + * Standard flags for allocating memory within scrub. NOFS context is 13 + * configured by the process allocation scope. Scrub and repair must be able 14 + * to back out gracefully if there isn't enough memory. Force-cast to avoid 15 + * complaints from static checkers. 16 + */ 17 + #define XCHK_GFP_FLAGS ((__force gfp_t)(GFP_KERNEL | __GFP_NOWARN | \ 18 + __GFP_RETRY_MAYFAIL)) 19 + 11 20 /* Type info and names for the scrub types. */ 12 21 enum xchk_type { 13 22 ST_NONE = 1, /* disabled */ ··· 48 39 49 40 /* AG btree roots */ 50 41 struct xfs_buf *agf_bp; 51 - struct xfs_buf *agfl_bp; 52 42 struct xfs_buf *agi_bp; 53 43 54 44 /* AG btrees */ ··· 168 160 #else 169 161 # define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0) 170 162 #endif 171 - 172 - struct xchk_fscounters { 173 - uint64_t icount; 174 - uint64_t ifree; 175 - uint64_t fdblocks; 176 - unsigned long long icount_min; 177 - unsigned long long icount_max; 178 - }; 179 163 180 164 #endif /* __XFS_SCRUB_SCRUB_H__ */
+1 -1
fs/xfs/scrub/symlink.c
··· 21 21 struct xfs_scrub *sc) 22 22 { 23 23 /* Allocate the buffer without the inode lock held. */ 24 - sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, GFP_KERNEL); 24 + sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, XCHK_GFP_FLAGS); 25 25 if (!sc->buf) 26 26 return -ENOMEM; 27 27
+19 -13
fs/xfs/xfs_aops.c
··· 17 17 #include "xfs_bmap.h" 18 18 #include "xfs_bmap_util.h" 19 19 #include "xfs_reflink.h" 20 + #include "xfs_errortag.h" 21 + #include "xfs_error.h" 20 22 21 23 struct xfs_writepage_ctx { 22 24 struct iomap_writepage_ctx ctx; ··· 116 114 if (unlikely(error)) { 117 115 if (ioend->io_flags & IOMAP_F_SHARED) { 118 116 xfs_reflink_cancel_cow_range(ip, offset, size, true); 119 - xfs_bmap_punch_delalloc_range(ip, 120 - XFS_B_TO_FSBT(mp, offset), 121 - XFS_B_TO_FSB(mp, size)); 117 + xfs_bmap_punch_delalloc_range(ip, offset, 118 + offset + size); 122 119 } 123 120 goto done; 124 121 } ··· 219 218 * checked (and found nothing at this offset) could have added 220 219 * overlapping blocks. 221 220 */ 222 - if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) 221 + if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) { 222 + trace_xfs_wb_data_iomap_invalid(ip, &wpc->iomap, 223 + XFS_WPC(wpc)->data_seq, XFS_DATA_FORK); 223 224 return false; 225 + } 224 226 if (xfs_inode_has_cow_data(ip) && 225 - XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) 227 + XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) { 228 + trace_xfs_wb_cow_iomap_invalid(ip, &wpc->iomap, 229 + XFS_WPC(wpc)->cow_seq, XFS_COW_FORK); 226 230 return false; 231 + } 227 232 return true; 228 233 } ··· 292 285 293 286 if (xfs_is_shutdown(mp)) 294 287 return -EIO; 288 + 289 + XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS); 295 290 296 291 /* 297 292 * COW fork blocks can overlap data fork blocks even if the blocks ··· 382 373 isnullstartblock(imap.br_startblock)) 383 374 goto allocate_blocks; 384 375 385 - xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0); 376 + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq); 386 377 trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); 387 378 return 0; 388 379 allocate_blocks: ··· 464 455 struct folio *folio, 465 456 loff_t pos) 466 457 { 467 - struct inode *inode = folio->mapping->host; 468 - struct xfs_inode *ip = XFS_I(inode); 458 + struct xfs_inode *ip = XFS_I(folio->mapping->host); 469 459 struct xfs_mount *mp = ip->i_mount; 470 460 size_t offset = offset_in_folio(folio, pos); 471 - xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, pos); 472 - xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, offset); 473 460 int error; 474 461 475 462 if (xfs_is_shutdown(mp)) ··· 475 470 "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.", 476 471 folio, ip->i_ino, pos); 477 472 478 - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 479 - i_blocks_per_folio(inode, folio) - pageoff_fsb); 473 + error = xfs_bmap_punch_delalloc_range(ip, pos, 474 + round_up(pos, folio_size(folio))); 475 + 480 476 if (error && !xfs_is_shutdown(mp)) 481 477 xfs_alert(mp, "page discard unable to remove delalloc mapping.");
+6 -4
fs/xfs/xfs_bmap_util.c
··· 590 590 int 591 591 xfs_bmap_punch_delalloc_range( 592 592 struct xfs_inode *ip, 593 - xfs_fileoff_t start_fsb, 594 - xfs_fileoff_t length) 593 + xfs_off_t start_byte, 594 + xfs_off_t end_byte) 595 595 { 596 + struct xfs_mount *mp = ip->i_mount; 596 597 struct xfs_ifork *ifp = &ip->i_df; 597 - xfs_fileoff_t end_fsb = start_fsb + length; 598 + xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte); 599 + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte); 598 600 struct xfs_bmbt_irec got, del; 599 601 struct xfs_iext_cursor icur; 600 602 int error = 0; ··· 609 607 610 608 while (got.br_startoff + got.br_blockcount > start_fsb) { 611 609 del = got; 612 - xfs_trim_extent(&del, start_fsb, length); 610 + xfs_trim_extent(&del, start_fsb, end_fsb - start_fsb); 613 611 614 612 /* 615 613 * A delete can push the cursor forward. Step back to the
+1 -1
fs/xfs/xfs_bmap_util.h
··· 31 31 #endif /* CONFIG_XFS_RT */ 32 32 33 33 int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, 34 - xfs_fileoff_t start_fsb, xfs_fileoff_t length); 34 + xfs_off_t start_byte, xfs_off_t end_byte); 35 35 36 36 struct kgetbmap { 37 37 __s64 bmv_offset; /* file offset of segment in blocks */
+1
fs/xfs/xfs_buf.c
··· 1945 1945 list_lru_destroy(&btp->bt_lru); 1946 1946 1947 1947 blkdev_issue_flush(btp->bt_bdev); 1948 + invalidate_bdev(btp->bt_bdev); 1948 1949 fs_put_dax(btp->bt_daxdev, btp->bt_mount); 1949 1950 1950 1951 kmem_free(btp);
+2
fs/xfs/xfs_buf_item.c
··· 1018 1018 trace_xfs_buf_item_relse(bp, _RET_IP_); 1019 1019 ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); 1020 1020 1021 + if (atomic_read(&bip->bli_refcount)) 1022 + return; 1021 1023 bp->b_log_item = NULL; 1022 1024 xfs_buf_rele(bp); 1023 1025 xfs_buf_item_free(bip);
+39 -7
fs/xfs/xfs_error.c
··· 46 46 XFS_RANDOM_REFCOUNT_FINISH_ONE, 47 47 XFS_RANDOM_BMAP_FINISH_ONE, 48 48 XFS_RANDOM_AG_RESV_CRITICAL, 49 - XFS_RANDOM_DROP_WRITES, 49 + 0, /* XFS_RANDOM_DROP_WRITES has been removed */ 50 50 XFS_RANDOM_LOG_BAD_CRC, 51 51 XFS_RANDOM_LOG_ITEM_PIN, 52 52 XFS_RANDOM_BUF_LRU_REF, ··· 60 60 XFS_RANDOM_LARP, 61 61 XFS_RANDOM_DA_LEAF_SPLIT, 62 62 XFS_RANDOM_ATTR_LEAF_TO_NODE, 63 + XFS_RANDOM_WB_DELAY_MS, 64 + XFS_RANDOM_WRITE_DELAY_MS, 63 65 }; 64 66 65 67 struct xfs_errortag_attr { ··· 164 162 XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE); 165 163 XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE); 166 164 XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL); 167 - XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES); 168 165 XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); 169 166 XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); 170 167 XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); ··· 177 176 XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP); 178 177 XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT); 179 178 XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE); 179 + XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS); 180 + XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS); 180 181 181 182 static struct attribute *xfs_errortag_attrs[] = { 182 183 XFS_ERRORTAG_ATTR_LIST(noerror), ··· 209 206 XFS_ERRORTAG_ATTR_LIST(refcount_finish_one), 210 207 XFS_ERRORTAG_ATTR_LIST(bmap_finish_one), 211 208 XFS_ERRORTAG_ATTR_LIST(ag_resv_critical), 212 - XFS_ERRORTAG_ATTR_LIST(drop_writes), 213 209 XFS_ERRORTAG_ATTR_LIST(log_bad_crc), 214 210 XFS_ERRORTAG_ATTR_LIST(log_item_pin), 215 211 XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), ··· 222 220 XFS_ERRORTAG_ATTR_LIST(larp), 223 221 XFS_ERRORTAG_ATTR_LIST(da_leaf_split), 224 222 XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node), 223 + XFS_ERRORTAG_ATTR_LIST(wb_delay_ms), 224 + 
XFS_ERRORTAG_ATTR_LIST(write_delay_ms), 225 225 NULL, 226 226 }; 227 227 ATTRIBUTE_GROUPS(xfs_errortag); ··· 260 256 kmem_free(mp->m_errortag); 261 257 } 262 258 259 + static bool 260 + xfs_errortag_valid( 261 + unsigned int error_tag) 262 + { 263 + if (error_tag >= XFS_ERRTAG_MAX) 264 + return false; 265 + 266 + /* Error out removed injection types */ 267 + if (error_tag == XFS_ERRTAG_DROP_WRITES) 268 + return false; 269 + return true; 270 + } 271 + 272 + bool 273 + xfs_errortag_enabled( 274 + struct xfs_mount *mp, 275 + unsigned int tag) 276 + { 277 + if (!mp->m_errortag) 278 + return false; 279 + if (!xfs_errortag_valid(tag)) 280 + return false; 281 + 282 + return mp->m_errortag[tag] != 0; 283 + } 284 + 263 285 bool 264 286 xfs_errortag_test( 265 287 struct xfs_mount *mp, ··· 307 277 if (!mp->m_errortag) 308 278 return false; 309 279 310 - ASSERT(error_tag < XFS_ERRTAG_MAX); 280 + if (!xfs_errortag_valid(error_tag)) 281 + return false; 282 + 311 283 randfactor = mp->m_errortag[error_tag]; 312 284 if (!randfactor || get_random_u32_below(randfactor)) 313 285 return false; ··· 325 293 struct xfs_mount *mp, 326 294 unsigned int error_tag) 327 295 { 328 - if (error_tag >= XFS_ERRTAG_MAX) 296 + if (!xfs_errortag_valid(error_tag)) 329 297 return -EINVAL; 330 298 331 299 return mp->m_errortag[error_tag]; ··· 337 305 unsigned int error_tag, 338 306 unsigned int tag_value) 339 307 { 340 - if (error_tag >= XFS_ERRTAG_MAX) 308 + if (!xfs_errortag_valid(error_tag)) 341 309 return -EINVAL; 342 310 343 311 mp->m_errortag[error_tag] = tag_value; ··· 351 319 { 352 320 BUILD_BUG_ON(ARRAY_SIZE(xfs_errortag_random_default) != XFS_ERRTAG_MAX); 353 321 354 - if (error_tag >= XFS_ERRTAG_MAX) 322 + if (!xfs_errortag_valid(error_tag)) 355 323 return -EINVAL; 356 324 357 325 return xfs_errortag_set(mp, error_tag,
+13
fs/xfs/xfs_error.h
··· 45 45 const char *file, int line, unsigned int error_tag); 46 46 #define XFS_TEST_ERROR(expr, mp, tag) \ 47 47 ((expr) || xfs_errortag_test((mp), #expr, __FILE__, __LINE__, (tag))) 48 + bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag); 49 + #define XFS_ERRORTAG_DELAY(mp, tag) \ 50 + do { \ 51 + might_sleep(); \ 52 + if (!xfs_errortag_enabled((mp), (tag))) \ 53 + break; \ 54 + xfs_warn_ratelimited((mp), \ 55 + "Injecting %ums delay at file %s, line %d, on filesystem \"%s\"", \ 56 + (mp)->m_errortag[(tag)], __FILE__, __LINE__, \ 57 + (mp)->m_super->s_id); \ 58 + mdelay((mp)->m_errortag[(tag)]); \ 59 + } while (0) 48 60 49 61 extern int xfs_errortag_get(struct xfs_mount *mp, unsigned int error_tag); 50 62 extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag, ··· 67 55 #define xfs_errortag_init(mp) (0) 68 56 #define xfs_errortag_del(mp) 69 57 #define XFS_TEST_ERROR(expr, mp, tag) (expr) 58 + #define XFS_ERRORTAG_DELAY(mp, tag) ((void)0) 70 59 #define xfs_errortag_set(mp, tag, val) (ENOSYS) 71 60 #define xfs_errortag_add(mp, tag) (ENOSYS) 72 61 #define xfs_errortag_clearall(mp) (ENOSYS)
+1 -1
fs/xfs/xfs_file.c
··· 1325 1325 if (write_fault) { 1326 1326 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); 1327 1327 ret = iomap_page_mkwrite(vmf, 1328 - &xfs_buffered_write_iomap_ops); 1328 + &xfs_page_mkwrite_iomap_ops); 1329 1329 xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); 1330 1330 } else { 1331 1331 ret = filemap_fault(vmf);
+2 -2
fs/xfs/xfs_fsmap.c
··· 524 524 struct xfs_mount *mp = tp->t_mountp; 525 525 int error; 526 526 527 - xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED); 527 + xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); 528 528 529 529 /* 530 530 * Set up query parameters to return free rtextents covering the range ··· 551 551 if (error) 552 552 goto err; 553 553 err: 554 - xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED); 554 + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); 555 555 return error; 556 556 } 557 557
+6
fs/xfs/xfs_icache.c
··· 342 342 343 343 trace_xfs_iget_recycle(ip); 344 344 345 + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) 346 + return -EAGAIN; 347 + 345 348 /* 346 349 * We need to make it look like the inode is being reclaimed to prevent 347 350 * the actual reclaim workers from stomping over us while we recycle ··· 358 355 359 356 ASSERT(!rwsem_is_locked(&inode->i_rwsem)); 360 357 error = xfs_reinit_inode(mp, inode); 358 + xfs_iunlock(ip, XFS_ILOCK_EXCL); 361 359 if (error) { 362 360 /* 363 361 * Re-initializing the inode failed, and we are in deep ··· 522 518 if (ip->i_flags & XFS_IRECLAIMABLE) { 523 519 /* Drops i_flags_lock and RCU read lock. */ 524 520 error = xfs_iget_recycle(pag, ip); 521 + if (error == -EAGAIN) 522 + goto out_skip; 525 523 if (error) 526 524 return error; 527 525 } else {
+1 -1
fs/xfs/xfs_inode.c
··· 2479 2479 error = xfs_dir_replace(tp, ip, &xfs_name_dotdot, 2480 2480 tp->t_mountp->m_sb.sb_rootino, 0); 2481 2481 if (error) 2482 - return error; 2482 + goto out_trans_cancel; 2483 2483 } 2484 2484 } else { 2485 2485 /*
+114 -71
fs/xfs/xfs_iomap.c
··· 48 48 return -EFSCORRUPTED; 49 49 } 50 50 51 + u64 52 + xfs_iomap_inode_sequence( 53 + struct xfs_inode *ip, 54 + u16 iomap_flags) 55 + { 56 + u64 cookie = 0; 57 + 58 + if (iomap_flags & IOMAP_F_XATTR) 59 + return READ_ONCE(ip->i_af.if_seq); 60 + if ((iomap_flags & IOMAP_F_SHARED) && ip->i_cowfp) 61 + cookie = (u64)READ_ONCE(ip->i_cowfp->if_seq) << 32; 62 + return cookie | READ_ONCE(ip->i_df.if_seq); 63 + } 64 + 65 + /* 66 + * Check that the iomap passed to us is still valid for the given offset and 67 + * length. 68 + */ 69 + static bool 70 + xfs_iomap_valid( 71 + struct inode *inode, 72 + const struct iomap *iomap) 73 + { 74 + struct xfs_inode *ip = XFS_I(inode); 75 + 76 + if (iomap->validity_cookie != 77 + xfs_iomap_inode_sequence(ip, iomap->flags)) { 78 + trace_xfs_iomap_invalid(ip, iomap); 79 + return false; 80 + } 81 + 82 + XFS_ERRORTAG_DELAY(ip->i_mount, XFS_ERRTAG_WRITE_DELAY_MS); 83 + return true; 84 + } 85 + 86 + const struct iomap_page_ops xfs_iomap_page_ops = { 87 + .iomap_valid = xfs_iomap_valid, 88 + }; 89 + 51 90 int 52 91 xfs_bmbt_to_iomap( 53 92 struct xfs_inode *ip, 54 93 struct iomap *iomap, 55 94 struct xfs_bmbt_irec *imap, 56 95 unsigned int mapping_flags, 57 - u16 iomap_flags) 96 + u16 iomap_flags, 97 + u64 sequence_cookie) 58 98 { 59 99 struct xfs_mount *mp = ip->i_mount; 60 100 struct xfs_buftarg *target = xfs_inode_buftarg(ip); ··· 131 91 if (xfs_ipincount(ip) && 132 92 (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) 133 93 iomap->flags |= IOMAP_F_DIRTY; 94 + 95 + iomap->validity_cookie = sequence_cookie; 96 + iomap->page_ops = &xfs_iomap_page_ops; 134 97 return 0; 135 98 } 136 99 ··· 238 195 xfs_fileoff_t offset_fsb, 239 196 xfs_fileoff_t count_fsb, 240 197 unsigned int flags, 241 - struct xfs_bmbt_irec *imap) 198 + struct xfs_bmbt_irec *imap, 199 + u64 *seq) 242 200 { 243 201 struct xfs_mount *mp = ip->i_mount; 244 202 struct xfs_trans *tp; ··· 329 285 error = xfs_alert_fsblock_zero(ip, imap); 330 286 331 287 out_unlock: 288 + 
*seq = xfs_iomap_inode_sequence(ip, 0); 332 289 xfs_iunlock(ip, XFS_ILOCK_EXCL); 333 290 return error; 334 291 ··· 788 743 bool shared = false; 789 744 u16 iomap_flags = 0; 790 745 unsigned int lockmode = XFS_ILOCK_SHARED; 746 + u64 seq; 791 747 792 748 ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); 793 749 ··· 857 811 goto out_unlock; 858 812 } 859 813 814 + seq = xfs_iomap_inode_sequence(ip, iomap_flags); 860 815 xfs_iunlock(ip, lockmode); 861 816 trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); 862 - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags); 817 + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq); 863 818 864 819 allocate_blocks: 865 820 error = -EAGAIN; ··· 886 839 xfs_iunlock(ip, lockmode); 887 840 888 841 error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb, 889 - flags, &imap); 842 + flags, &imap, &seq); 890 843 if (error) 891 844 return error; 892 845 893 846 trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); 894 847 return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 895 - iomap_flags | IOMAP_F_NEW); 848 + iomap_flags | IOMAP_F_NEW, seq); 896 849 897 850 out_found_cow: 898 - xfs_iunlock(ip, lockmode); 899 851 length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); 900 852 trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); 901 853 if (imap.br_startblock != HOLESTARTBLOCK) { 902 - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); 854 + seq = xfs_iomap_inode_sequence(ip, 0); 855 + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); 903 856 if (error) 904 - return error; 857 + goto out_unlock; 905 858 } 906 - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED); 859 + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); 860 + xfs_iunlock(ip, lockmode); 861 + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); 907 862 908 863 out_unlock: 909 864 if (lockmode) ··· 964 915 int allocfork = XFS_DATA_FORK; 965 
916 int error = 0; 966 917 unsigned int lockmode = XFS_ILOCK_EXCL; 918 + u64 seq; 967 919 968 920 if (xfs_is_shutdown(mp)) 969 921 return -EIO; ··· 975 925 flags, iomap, srcmap); 976 926 977 927 ASSERT(!XFS_IS_REALTIME_INODE(ip)); 928 + 929 + error = xfs_qm_dqattach(ip); 930 + if (error) 931 + return error; 978 932 979 933 error = xfs_ilock_for_iomap(ip, flags, &lockmode); 980 934 if (error) ··· 1083 1029 allocfork = XFS_COW_FORK; 1084 1030 } 1085 1031 1086 - error = xfs_qm_dqattach_locked(ip, false); 1087 - if (error) 1088 - goto out_unlock; 1089 - 1090 1032 if (eof && offset + count > XFS_ISIZE(ip)) { 1091 1033 /* 1092 1034 * Determine the initial size of the preallocation. ··· 1144 1094 * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch 1145 1095 * them out if the write happens to fail. 1146 1096 */ 1097 + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW); 1147 1098 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1148 1099 trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); 1149 - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW); 1100 + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq); 1150 1101 1151 1102 found_imap: 1103 + seq = xfs_iomap_inode_sequence(ip, 0); 1152 1104 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1153 - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); 1105 + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); 1154 1106 1155 1107 found_cow: 1156 - xfs_iunlock(ip, XFS_ILOCK_EXCL); 1108 + seq = xfs_iomap_inode_sequence(ip, 0); 1157 1109 if (imap.br_startoff <= offset_fsb) { 1158 - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); 1110 + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); 1159 1111 if (error) 1160 - return error; 1112 + goto out_unlock; 1113 + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); 1114 + xfs_iunlock(ip, XFS_ILOCK_EXCL); 1161 1115 return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 1162 - IOMAP_F_SHARED); 1116 + IOMAP_F_SHARED, seq); 1163 1117 } 1164 1118 1165 
1119 xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); 1166 - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0); 1120 + xfs_iunlock(ip, XFS_ILOCK_EXCL); 1121 + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq); 1167 1122 1168 1123 out_unlock: 1169 1124 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1170 1125 return error; 1126 + } 1127 + 1128 + static int 1129 + xfs_buffered_write_delalloc_punch( 1130 + struct inode *inode, 1131 + loff_t offset, 1132 + loff_t length) 1133 + { 1134 + return xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, 1135 + offset + length); 1171 1136 } 1172 1137 1173 1138 static int ··· 1194 1129 unsigned flags, 1195 1130 struct iomap *iomap) 1196 1131 { 1197 - struct xfs_inode *ip = XFS_I(inode); 1198 - struct xfs_mount *mp = ip->i_mount; 1199 - xfs_fileoff_t start_fsb; 1200 - xfs_fileoff_t end_fsb; 1201 - int error = 0; 1202 1132 1203 - if (iomap->type != IOMAP_DELALLOC) 1204 - return 0; 1133 + struct xfs_mount *mp = XFS_M(inode->i_sb); 1134 + int error; 1205 1135 1206 - /* 1207 - * Behave as if the write failed if drop writes is enabled. Set the NEW 1208 - * flag to force delalloc cleanup. 1209 - */ 1210 - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DROP_WRITES)) { 1211 - iomap->flags |= IOMAP_F_NEW; 1212 - written = 0; 1136 + error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset, 1137 + length, written, &xfs_buffered_write_delalloc_punch); 1138 + if (error && !xfs_is_shutdown(mp)) { 1139 + xfs_alert(mp, "%s: unable to clean up ino 0x%llx", 1140 + __func__, XFS_I(inode)->i_ino); 1141 + return error; 1213 1142 } 1214 - 1215 - /* 1216 - * start_fsb refers to the first unused block after a short write. If 1217 - * nothing was written, round offset down to point at the first block in 1218 - * the range. 
1219 - */ 1220 - if (unlikely(!written)) 1221 - start_fsb = XFS_B_TO_FSBT(mp, offset); 1222 - else 1223 - start_fsb = XFS_B_TO_FSB(mp, offset + written); 1224 - end_fsb = XFS_B_TO_FSB(mp, offset + length); 1225 - 1226 - /* 1227 - * Trim delalloc blocks if they were allocated by this write and we 1228 - * didn't manage to write the whole range. 1229 - * 1230 - * We don't need to care about racing delalloc as we hold i_mutex 1231 - * across the reserve/allocate/unreserve calls. If there are delalloc 1232 - * blocks in the range, they are ours. 1233 - */ 1234 - if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) { 1235 - truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), 1236 - XFS_FSB_TO_B(mp, end_fsb) - 1); 1237 - 1238 - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1239 - end_fsb - start_fsb); 1240 - if (error && !xfs_is_shutdown(mp)) { 1241 - xfs_alert(mp, "%s: unable to clean up ino %lld", 1242 - __func__, ip->i_ino); 1243 - return error; 1244 - } 1245 - } 1246 - 1247 1143 return 0; 1248 1144 } 1249 1145 1250 1146 const struct iomap_ops xfs_buffered_write_iomap_ops = { 1251 1147 .iomap_begin = xfs_buffered_write_iomap_begin, 1252 1148 .iomap_end = xfs_buffered_write_iomap_end, 1149 + }; 1150 + 1151 + /* 1152 + * iomap_page_mkwrite() will never fail in a way that requires delalloc extents 1153 + * that it allocated to be revoked. Hence we do not need an .iomap_end method 1154 + * for this operation. 
1155 + */ 1156 + const struct iomap_ops xfs_page_mkwrite_iomap_ops = { 1157 + .iomap_begin = xfs_buffered_write_iomap_begin, 1253 1158 }; 1254 1159 1255 1160 static int ··· 1239 1204 int nimaps = 1, error = 0; 1240 1205 bool shared = false; 1241 1206 unsigned int lockmode = XFS_ILOCK_SHARED; 1207 + u64 seq; 1242 1208 1243 1209 ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO))); 1244 1210 ··· 1253 1217 &nimaps, 0); 1254 1218 if (!error && ((flags & IOMAP_REPORT) || IS_DAX(inode))) 1255 1219 error = xfs_reflink_trim_around_shared(ip, &imap, &shared); 1220 + seq = xfs_iomap_inode_sequence(ip, shared ? IOMAP_F_SHARED : 0); 1256 1221 xfs_iunlock(ip, lockmode); 1257 1222 1258 1223 if (error) 1259 1224 return error; 1260 1225 trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); 1261 1226 return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 1262 - shared ? IOMAP_F_SHARED : 0); 1227 + shared ? IOMAP_F_SHARED : 0, seq); 1263 1228 } 1264 1229 1265 1230 const struct iomap_ops xfs_read_iomap_ops = { ··· 1285 1248 struct xfs_bmbt_irec imap, cmap; 1286 1249 int error = 0; 1287 1250 unsigned lockmode; 1251 + u64 seq; 1288 1252 1289 1253 if (xfs_is_shutdown(mp)) 1290 1254 return -EIO; ··· 1320 1282 if (data_fsb < cow_fsb + cmap.br_blockcount) 1321 1283 end_fsb = min(end_fsb, data_fsb); 1322 1284 xfs_trim_extent(&cmap, offset_fsb, end_fsb); 1285 + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); 1323 1286 error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 1324 - IOMAP_F_SHARED); 1287 + IOMAP_F_SHARED, seq); 1325 1288 /* 1326 1289 * This is a COW extent, so we must probe the page cache 1327 1290 * because there could be dirty page cache being backed ··· 1343 1304 imap.br_startblock = HOLESTARTBLOCK; 1344 1305 imap.br_state = XFS_EXT_NORM; 1345 1306 done: 1307 + seq = xfs_iomap_inode_sequence(ip, 0); 1346 1308 xfs_trim_extent(&imap, offset_fsb, end_fsb); 1347 - error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); 1309 + error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, 
seq); 1348 1310 out_unlock: 1349 1311 xfs_iunlock(ip, lockmode); 1350 1312 return error; ··· 1371 1331 struct xfs_bmbt_irec imap; 1372 1332 int nimaps = 1, error = 0; 1373 1333 unsigned lockmode; 1334 + int seq; 1374 1335 1375 1336 if (xfs_is_shutdown(mp)) 1376 1337 return -EIO; ··· 1388 1347 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, 1389 1348 &nimaps, XFS_BMAPI_ATTRFORK); 1390 1349 out_unlock: 1350 + 1351 + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_XATTR); 1391 1352 xfs_iunlock(ip, lockmode); 1392 1353 1393 1354 if (error) 1394 1355 return error; 1395 1356 ASSERT(nimaps); 1396 - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); 1357 + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_XATTR, seq); 1397 1358 } 1398 1359 1399 1360 const struct iomap_ops xfs_xattr_iomap_ops = {
+4 -2
fs/xfs/xfs_iomap.h
··· 13 13 14 14 int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, 15 15 xfs_fileoff_t count_fsb, unsigned int flags, 16 - struct xfs_bmbt_irec *imap); 16 + struct xfs_bmbt_irec *imap, u64 *sequence); 17 17 int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); 18 18 xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip, 19 19 xfs_fileoff_t end_fsb); 20 20 21 + u64 xfs_iomap_inode_sequence(struct xfs_inode *ip, u16 iomap_flags); 21 22 int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap, 22 23 struct xfs_bmbt_irec *imap, unsigned int mapping_flags, 23 - u16 iomap_flags); 24 + u16 iomap_flags, u64 sequence_cookie); 24 25 25 26 int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len, 26 27 bool *did_zero); ··· 48 47 } 49 48 50 49 extern const struct iomap_ops xfs_buffered_write_iomap_ops; 50 + extern const struct iomap_ops xfs_page_mkwrite_iomap_ops; 51 51 extern const struct iomap_ops xfs_direct_write_iomap_ops; 52 52 extern const struct iomap_ops xfs_read_iomap_ops; 53 53 extern const struct iomap_ops xfs_seek_iomap_ops;
+31 -15
fs/xfs/xfs_log.c
··· 644 644 int min_logfsbs; 645 645 646 646 if (!xfs_has_norecovery(mp)) { 647 - xfs_notice(mp, "Mounting V%d Filesystem", 648 - XFS_SB_VERSION_NUM(&mp->m_sb)); 647 + xfs_notice(mp, "Mounting V%d Filesystem %pU", 648 + XFS_SB_VERSION_NUM(&mp->m_sb), 649 + &mp->m_sb.sb_uuid); 649 650 } else { 650 651 xfs_notice(mp, 651 - "Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", 652 - XFS_SB_VERSION_NUM(&mp->m_sb)); 652 + "Mounting V%d filesystem %pU in no-recovery mode. Filesystem will be inconsistent.", 653 + XFS_SB_VERSION_NUM(&mp->m_sb), 654 + &mp->m_sb.sb_uuid); 653 655 ASSERT(xfs_is_readonly(mp)); 654 656 } 655 657 ··· 889 887 } 890 888 891 889 /* 890 + * Cycle all the iclogbuf locks to make sure all log IO completion 891 + * is done before we tear down these buffers. 892 + */ 893 + static void 894 + xlog_wait_iclog_completion(struct xlog *log) 895 + { 896 + int i; 897 + struct xlog_in_core *iclog = log->l_iclog; 898 + 899 + for (i = 0; i < log->l_iclog_bufs; i++) { 900 + down(&iclog->ic_sema); 901 + up(&iclog->ic_sema); 902 + iclog = iclog->ic_next; 903 + } 904 + } 905 + 906 + /* 892 907 * Wait for the iclog and all prior iclogs to be written disk as required by the 893 908 * log force state machine. Waiting on ic_force_wait ensures iclog completions 894 909 * have been ordered and callbacks run before we are woken here, hence ··· 1129 1110 struct xfs_mount *mp) 1130 1111 { 1131 1112 xfs_log_clean(mp); 1113 + 1114 + /* 1115 + * If shutdown has come from iclog IO context, the log 1116 + * cleaning will have been skipped and so we need to wait 1117 + * for the iclog to complete shutdown processing before we 1118 + * tear anything down. 
1119 + */ 1120 + xlog_wait_iclog_completion(mp->m_log); 1132 1121 1133 1122 xfs_buftarg_drain(mp->m_ddev_targp); 1134 1123 ··· 2139 2112 { 2140 2113 xlog_in_core_t *iclog, *next_iclog; 2141 2114 int i; 2142 - 2143 - /* 2144 - * Cycle all the iclogbuf locks to make sure all log IO completion 2145 - * is done before we tear down these buffers. 2146 - */ 2147 - iclog = log->l_iclog; 2148 - for (i = 0; i < log->l_iclog_bufs; i++) { 2149 - down(&iclog->ic_sema); 2150 - up(&iclog->ic_sema); 2151 - iclog = iclog->ic_next; 2152 - } 2153 2115 2154 2116 /* 2155 2117 * Destroy the CIL after waiting for iclog IO completion because an
+15
fs/xfs/xfs_mount.c
··· 538 538 return 0; 539 539 } 540 540 541 + static void 542 + xfs_unmount_check( 543 + struct xfs_mount *mp) 544 + { 545 + if (xfs_is_shutdown(mp)) 546 + return; 547 + 548 + if (percpu_counter_sum(&mp->m_ifree) > 549 + percpu_counter_sum(&mp->m_icount)) { 550 + xfs_alert(mp, "ifree/icount mismatch at unmount"); 551 + xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); 552 + } 553 + } 554 + 541 555 /* 542 556 * Flush and reclaim dirty inodes in preparation for unmount. Inodes and 543 557 * internal inode structures can be sitting in the CIL and AIL at this point, ··· 1091 1077 if (error) 1092 1078 xfs_warn(mp, "Unable to free reserved block pool. " 1093 1079 "Freespace may not be correct on next mount."); 1080 + xfs_unmount_check(mp); 1094 1081 1095 1082 xfs_log_unmount(mp); 1096 1083 xfs_da_unmount(mp);
+4 -2
fs/xfs/xfs_pnfs.c
··· 125 125 int nimaps = 1; 126 126 uint lock_flags; 127 127 int error = 0; 128 + u64 seq; 128 129 129 130 if (xfs_is_shutdown(mp)) 130 131 return -EIO; ··· 177 176 lock_flags = xfs_ilock_data_map_shared(ip); 178 177 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, 179 178 &imap, &nimaps, bmapi_flags); 179 + seq = xfs_iomap_inode_sequence(ip, 0); 180 180 181 181 ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK); 182 182 ··· 191 189 xfs_iunlock(ip, lock_flags); 192 190 193 191 error = xfs_iomap_write_direct(ip, offset_fsb, 194 - end_fsb - offset_fsb, 0, &imap); 192 + end_fsb - offset_fsb, 0, &imap, &seq); 195 193 if (error) 196 194 goto out_unlock; 197 195 ··· 211 209 } 212 210 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 213 211 214 - error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0); 212 + error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq); 215 213 *device_generation = mp->m_generation; 216 214 return error; 217 215 out_unlock:
+12 -4
fs/xfs/xfs_qm.c
··· 423 423 goto out_miss_busy; 424 424 425 425 /* 426 + * If something else is freeing this dquot and hasn't yet removed it 427 + * from the LRU, leave it for the freeing task to complete the freeing 428 + * process rather than risk it being free from under us here. 429 + */ 430 + if (dqp->q_flags & XFS_DQFLAG_FREEING) 431 + goto out_miss_unlock; 432 + 433 + /* 426 434 * This dquot has acquired a reference in the meantime remove it from 427 435 * the freelist and try again. 428 436 */ ··· 449 441 * skip it so there is time for the IO to complete before we try to 450 442 * reclaim it again on the next LRU pass. 451 443 */ 452 - if (!xfs_dqflock_nowait(dqp)) { 453 - xfs_dqunlock(dqp); 454 - goto out_miss_busy; 455 - } 444 + if (!xfs_dqflock_nowait(dqp)) 445 + goto out_miss_unlock; 456 446 457 447 if (XFS_DQ_IS_DIRTY(dqp)) { 458 448 struct xfs_buf *bp = NULL; ··· 484 478 XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims); 485 479 return LRU_REMOVED; 486 480 481 + out_miss_unlock: 482 + xfs_dqunlock(dqp); 487 483 out_miss_busy: 488 484 trace_xfs_dqreclaim_busy(dqp); 489 485 XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
+54 -6
fs/xfs/xfs_rtalloc.c
··· 1311 1311 uint64_t val = 0; 1312 1312 int error; 1313 1313 1314 - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); 1314 + xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); 1315 1315 error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent, 1316 1316 &val); 1317 - xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL); 1317 + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); 1318 1318 if (error) 1319 1319 return error; 1320 1320 ··· 1323 1323 spin_unlock(&mp->m_sb_lock); 1324 1324 percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); 1325 1325 return 0; 1326 + } 1327 + 1328 + /* 1329 + * Read in the bmbt of an rt metadata inode so that we never have to load them 1330 + * at runtime. This enables the use of shared ILOCKs for rtbitmap scans. Use 1331 + * an empty transaction to avoid deadlocking on loops in the bmbt. 1332 + */ 1333 + static inline int 1334 + xfs_rtmount_iread_extents( 1335 + struct xfs_inode *ip, 1336 + unsigned int lock_class) 1337 + { 1338 + struct xfs_trans *tp; 1339 + int error; 1340 + 1341 + error = xfs_trans_alloc_empty(ip->i_mount, &tp); 1342 + if (error) 1343 + return error; 1344 + 1345 + xfs_ilock(ip, XFS_ILOCK_EXCL | lock_class); 1346 + 1347 + error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); 1348 + if (error) 1349 + goto out_unlock; 1350 + 1351 + if (xfs_inode_has_attr_fork(ip)) { 1352 + error = xfs_iread_extents(tp, ip, XFS_ATTR_FORK); 1353 + if (error) 1354 + goto out_unlock; 1355 + } 1356 + 1357 + out_unlock: 1358 + xfs_iunlock(ip, XFS_ILOCK_EXCL | lock_class); 1359 + xfs_trans_cancel(tp); 1360 + return error; 1326 1361 } 1327 1362 1328 1363 /* ··· 1377 1342 return error; 1378 1343 ASSERT(mp->m_rbmip != NULL); 1379 1344 1345 + error = xfs_rtmount_iread_extents(mp->m_rbmip, XFS_ILOCK_RTBITMAP); 1346 + if (error) 1347 + goto out_rele_bitmap; 1348 + 1380 1349 error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip); 1381 - if (error) { 1382 - xfs_irele(mp->m_rbmip); 1383 - return error; 1384 - } 1350 + if 
(error) 1351 + goto out_rele_bitmap; 1385 1352 ASSERT(mp->m_rsumip != NULL); 1353 + 1354 + error = xfs_rtmount_iread_extents(mp->m_rsumip, XFS_ILOCK_RTSUM); 1355 + if (error) 1356 + goto out_rele_summary; 1357 + 1386 1358 xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks); 1387 1359 return 0; 1360 + 1361 + out_rele_summary: 1362 + xfs_irele(mp->m_rsumip); 1363 + out_rele_bitmap: 1364 + xfs_irele(mp->m_rbmip); 1365 + return error; 1388 1366 } 1389 1367 1390 1368 void
+1 -1
fs/xfs/xfs_super.c
··· 1110 1110 if (!sb->s_fs_info) 1111 1111 return; 1112 1112 1113 - xfs_notice(mp, "Unmounting Filesystem"); 1113 + xfs_notice(mp, "Unmounting Filesystem %pU", &mp->m_sb.sb_uuid); 1114 1114 xfs_filestream_unmount(mp); 1115 1115 xfs_unmountfs(mp); 1116 1116
+2
fs/xfs/xfs_trace.c
··· 34 34 #include "xfs_ag.h" 35 35 #include "xfs_ag_resv.h" 36 36 #include "xfs_error.h" 37 + #include <linux/iomap.h> 38 + #include "xfs_iomap.h" 37 39 38 40 /* 39 41 * We include this last to have the helpers above available for the trace
+86
fs/xfs/xfs_trace.h
···
 	TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), \
 	TP_ARGS(ip, irec))
 
+/* inode iomap invalidation events */
+DECLARE_EVENT_CLASS(xfs_wb_invalid_class,
+	TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap, unsigned int wpcseq, int whichfork),
+	TP_ARGS(ip, iomap, wpcseq, whichfork),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(u64, addr)
+		__field(loff_t, pos)
+		__field(u64, len)
+		__field(u16, type)
+		__field(u16, flags)
+		__field(u32, wpcseq)
+		__field(u32, forkseq)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->addr = iomap->addr;
+		__entry->pos = iomap->offset;
+		__entry->len = iomap->length;
+		__entry->type = iomap->type;
+		__entry->flags = iomap->flags;
+		__entry->wpcseq = wpcseq;
+		__entry->forkseq = READ_ONCE(xfs_ifork_ptr(ip, whichfork)->if_seq);
+	),
+	TP_printk("dev %d:%d ino 0x%llx pos 0x%llx addr 0x%llx bytecount 0x%llx type 0x%x flags 0x%x wpcseq 0x%x forkseq 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->pos,
+		  __entry->addr,
+		  __entry->len,
+		  __entry->type,
+		  __entry->flags,
+		  __entry->wpcseq,
+		  __entry->forkseq)
+);
+#define DEFINE_WB_INVALID_EVENT(name) \
+DEFINE_EVENT(xfs_wb_invalid_class, name, \
+	TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap, unsigned int wpcseq, int whichfork), \
+	TP_ARGS(ip, iomap, wpcseq, whichfork))
+DEFINE_WB_INVALID_EVENT(xfs_wb_cow_iomap_invalid);
+DEFINE_WB_INVALID_EVENT(xfs_wb_data_iomap_invalid);
+
+DECLARE_EVENT_CLASS(xfs_iomap_invalid_class,
+	TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap),
+	TP_ARGS(ip, iomap),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(u64, addr)
+		__field(loff_t, pos)
+		__field(u64, len)
+		__field(u64, validity_cookie)
+		__field(u64, inodeseq)
+		__field(u16, type)
+		__field(u16, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->addr = iomap->addr;
+		__entry->pos = iomap->offset;
+		__entry->len = iomap->length;
+		__entry->validity_cookie = iomap->validity_cookie;
+		__entry->type = iomap->type;
+		__entry->flags = iomap->flags;
+		__entry->inodeseq = xfs_iomap_inode_sequence(ip, iomap->flags);
+	),
+	TP_printk("dev %d:%d ino 0x%llx pos 0x%llx addr 0x%llx bytecount 0x%llx type 0x%x flags 0x%x validity_cookie 0x%llx inodeseq 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->pos,
+		  __entry->addr,
+		  __entry->len,
+		  __entry->type,
+		  __entry->flags,
+		  __entry->validity_cookie,
+		  __entry->inodeseq)
+);
+#define DEFINE_IOMAP_INVALID_EVENT(name) \
+DEFINE_EVENT(xfs_iomap_invalid_class, name, \
+	TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap), \
+	TP_ARGS(ip, iomap))
+DEFINE_IOMAP_INVALID_EVENT(xfs_iomap_invalid);
+
 /* refcount/reflink tracepoint definitions */
 
 /* reflink tracepoints */
+3 -1
fs/xfs/xfs_trans_ail.c
···
 	struct xfs_ail_cursor	cur;
 	struct xfs_log_item	*lip;
 	xfs_lsn_t		lsn;
-	xfs_lsn_t		target;
+	xfs_lsn_t		target = NULLCOMMITLSN;
 	long			tout;
 	int			stuck = 0;
 	int			flushing = 0;
···
 		goto out_done;
 
 	XFS_STATS_INC(mp, xs_push_ail);
+
+	ASSERT(target != NULLCOMMITLSN);
 
 	lsn = lip->li_lsn;
 	while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) {
+1 -1
fs/xfs/xfs_xattr.c
···
 		return;
 	}
 	offset = context->buffer + context->count;
-	strncpy(offset, prefix, prefix_len);
+	memcpy(offset, prefix, prefix_len);
 	offset += prefix_len;
 	strncpy(offset, (char *)name, namelen);	/* real name */
 	offset += namelen;
+39 -8
include/linux/iomap.h
···
  *
  * IOMAP_F_BUFFER_HEAD indicates that the file system requires the use of
  * buffer heads for this mapping.
+ *
+ * IOMAP_F_XATTR indicates that the iomap is for an extended attribute extent
+ * rather than a file data extent.
  */
-#define IOMAP_F_NEW		0x01
-#define IOMAP_F_DIRTY		0x02
-#define IOMAP_F_SHARED		0x04
-#define IOMAP_F_MERGED		0x08
-#define IOMAP_F_BUFFER_HEAD	0x10
-#define IOMAP_F_ZONE_APPEND	0x20
+#define IOMAP_F_NEW		(1U << 0)
+#define IOMAP_F_DIRTY		(1U << 1)
+#define IOMAP_F_SHARED		(1U << 2)
+#define IOMAP_F_MERGED		(1U << 3)
+#define IOMAP_F_BUFFER_HEAD	(1U << 4)
+#define IOMAP_F_ZONE_APPEND	(1U << 5)
+#define IOMAP_F_XATTR		(1U << 6)
 
 /*
  * Flags set by the core iomap code during operations:
  *
  * IOMAP_F_SIZE_CHANGED indicates to the iomap_end method that the file size
  * has changed as the result of this write operation.
+ *
+ * IOMAP_F_STALE indicates that the iomap is not valid any longer and the file
+ * range it covers needs to be remapped by the high level before the operation
+ * can proceed.
  */
-#define IOMAP_F_SIZE_CHANGED	0x100
+#define IOMAP_F_SIZE_CHANGED	(1U << 8)
+#define IOMAP_F_STALE		(1U << 9)
 
 /*
  * Flags from 0x1000 up are for file system specific usage:
  */
-#define IOMAP_F_PRIVATE		0x1000
+#define IOMAP_F_PRIVATE		(1U << 12)
 
 
 /*
···
 	void *inline_data;
 	void *private; /* filesystem private */
 	const struct iomap_page_ops *page_ops;
+	u64 validity_cookie; /* used with .iomap_valid() */
 };
 
 static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos)
···
 	int (*page_prepare)(struct inode *inode, loff_t pos, unsigned len);
 	void (*page_done)(struct inode *inode, loff_t pos, unsigned copied,
 			struct page *page);
+
+	/*
+	 * Check that the cached iomap still maps correctly to the filesystem's
+	 * internal extent map. FS internal extent maps can change while iomap
+	 * is iterating a cached iomap, so this hook allows iomap to detect that
+	 * the iomap needs to be refreshed during a long running write
+	 * operation.
+	 *
+	 * The filesystem can store internal state (e.g. a sequence number) in
+	 * iomap->validity_cookie when the iomap is first mapped to be able to
+	 * detect changes between mapping time and whenever .iomap_valid() is
+	 * called.
+	 *
+	 * This is called with the folio over the specified file position held
+	 * locked by the iomap code.
+	 */
+	bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap);
 };
 
 /*
···
 
 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
 		const struct iomap_ops *ops);
+int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
+		struct iomap *iomap, loff_t pos, loff_t length, ssize_t written,
+		int (*punch)(struct inode *inode, loff_t pos, loff_t length));
+
 int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
 void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
 bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);