Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'vfs-6.16-rc1.iomap' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull iomap updates from Christian Brauner:

- More fallout and preparatory work associated with the folio batch
prototype posted a while back.

Mainly this just cleans up some of the helpers and pushes some
pos/len trimming further down in the write begin path.

- Add missing flag descriptions to the iomap documentation

* tag 'vfs-6.16-rc1.iomap' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
iomap: rework iomap_write_begin() to return folio offset and length
iomap: push non-large folio check into get folio path
iomap: helper to trim pos/bytes to within folio
iomap: drop pos param from __iomap_[get|put]_folio()
iomap: drop unnecessary pos param from iomap_write_[begin|end]
iomap: resample iter->pos after iomap_write_begin() calls
iomap: trace: Add missing flags to [IOMAP_|IOMAP_F_]FLAGS_STRINGS
Documentation: iomap: Add missing flags description

+93 -50
+14 -2
Documentation/filesystems/iomap/design.rst
··· 243 243 regular file data. 244 244 This is only useful for FIEMAP. 245 245 246 - * **IOMAP_F_PRIVATE**: Starting with this value, the upper bits can 247 - be set by the filesystem for its own purposes. 246 + * **IOMAP_F_BOUNDARY**: This indicates I/O and its completion must not be 247 + merged with any other I/O or completion. Filesystems must use this when 248 + submitting I/O to devices that cannot handle I/O crossing certain LBAs 249 + (e.g. ZNS devices). This flag applies only to buffered I/O writeback; all 250 + other functions ignore it. 251 + 252 + * **IOMAP_F_PRIVATE**: This flag is reserved for filesystem private use. 248 253 249 254 * **IOMAP_F_ANON_WRITE**: Indicates that (write) I/O does not have a target 250 255 block assigned to it yet and the file system will do that in the bio 251 256 submission handler, splitting the I/O as needed. 257 + 258 + * **IOMAP_F_ATOMIC_BIO**: This indicates write I/O must be submitted with the 259 + ``REQ_ATOMIC`` flag set in the bio. Filesystems need to set this flag to 260 + inform iomap that the write I/O operation requires torn-write protection 261 + based on a HW-offload mechanism. They must also ensure that mapping updates 262 + upon the completion of the I/O are performed in a single metadata 263 + update. 252 264 253 265 These flags can be set by iomap itself during file operations. 254 266 The filesystem should supply an ``->iomap_end`` function if it needs
+58 -42
fs/iomap/buffered-io.c
··· 679 679 return submit_bio_wait(&bio); 680 680 } 681 681 682 - static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, 683 - size_t len, struct folio *folio) 682 + static int __iomap_write_begin(const struct iomap_iter *iter, size_t len, 683 + struct folio *folio) 684 684 { 685 685 const struct iomap *srcmap = iomap_iter_srcmap(iter); 686 686 struct iomap_folio_state *ifs; 687 + loff_t pos = iter->pos; 687 688 loff_t block_size = i_blocksize(iter->inode); 688 689 loff_t block_start = round_down(pos, block_size); 689 690 loff_t block_end = round_up(pos + len, block_size); ··· 742 741 return 0; 743 742 } 744 743 745 - static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos, 746 - size_t len) 744 + static struct folio *__iomap_get_folio(struct iomap_iter *iter, size_t len) 747 745 { 748 746 const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; 747 + loff_t pos = iter->pos; 748 + 749 + if (!mapping_large_folio_support(iter->inode->i_mapping)) 750 + len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos)); 749 751 750 752 if (folio_ops && folio_ops->get_folio) 751 753 return folio_ops->get_folio(iter, pos, len); ··· 756 752 return iomap_get_folio(iter, pos, len); 757 753 } 758 754 759 - static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret, 755 + static void __iomap_put_folio(struct iomap_iter *iter, size_t ret, 760 756 struct folio *folio) 761 757 { 762 758 const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; 759 + loff_t pos = iter->pos; 763 760 764 761 if (folio_ops && folio_ops->put_folio) { 765 762 folio_ops->put_folio(iter->inode, pos, ret, folio); ··· 768 763 folio_unlock(folio); 769 764 folio_put(folio); 770 765 } 766 + } 767 + 768 + /* trim pos and bytes to within a given folio */ 769 + static loff_t iomap_trim_folio_range(struct iomap_iter *iter, 770 + struct folio *folio, size_t *offset, u64 *bytes) 771 + { 772 + loff_t pos = iter->pos; 773 + size_t fsize = 
folio_size(folio); 774 + 775 + WARN_ON_ONCE(pos < folio_pos(folio)); 776 + WARN_ON_ONCE(pos >= folio_pos(folio) + fsize); 777 + 778 + *offset = offset_in_folio(folio, pos); 779 + *bytes = min(*bytes, fsize - *offset); 780 + 781 + return pos; 771 782 } 772 783 773 784 static int iomap_write_begin_inline(const struct iomap_iter *iter, ··· 795 774 return iomap_read_inline_data(iter, folio); 796 775 } 797 776 798 - static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, 799 - size_t len, struct folio **foliop) 777 + /* 778 + * Grab and prepare a folio for write based on iter state. Returns the folio, 779 + * offset, and length. Callers can optionally pass a max length *plen, 780 + * otherwise init to zero. 781 + */ 782 + static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop, 783 + size_t *poffset, u64 *plen) 800 784 { 801 785 const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; 802 786 const struct iomap *srcmap = iomap_iter_srcmap(iter); 787 + loff_t pos = iter->pos; 788 + u64 len = min_t(u64, SIZE_MAX, iomap_length(iter)); 803 789 struct folio *folio; 804 790 int status = 0; 805 791 792 + len = min_not_zero(len, *plen); 806 793 BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); 807 794 if (srcmap != &iter->iomap) 808 795 BUG_ON(pos + len > srcmap->offset + srcmap->length); ··· 818 789 if (fatal_signal_pending(current)) 819 790 return -EINTR; 820 791 821 - if (!mapping_large_folio_support(iter->inode->i_mapping)) 822 - len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos)); 823 - 824 - folio = __iomap_get_folio(iter, pos, len); 792 + folio = __iomap_get_folio(iter, len); 825 793 if (IS_ERR(folio)) 826 794 return PTR_ERR(folio); 827 795 ··· 842 816 } 843 817 } 844 818 845 - if (pos + len > folio_pos(folio) + folio_size(folio)) 846 - len = folio_pos(folio) + folio_size(folio) - pos; 819 + pos = iomap_trim_folio_range(iter, folio, poffset, &len); 847 820 848 821 if (srcmap->type == IOMAP_INLINE) 849 822 status = 
iomap_write_begin_inline(iter, folio); 850 823 else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) 851 824 status = __block_write_begin_int(folio, pos, len, NULL, srcmap); 852 825 else 853 - status = __iomap_write_begin(iter, pos, len, folio); 826 + status = __iomap_write_begin(iter, len, folio); 854 827 855 828 if (unlikely(status)) 856 829 goto out_unlock; 857 830 858 831 *foliop = folio; 832 + *plen = len; 859 833 return 0; 860 834 861 835 out_unlock: 862 - __iomap_put_folio(iter, pos, 0, folio); 836 + __iomap_put_folio(iter, 0, folio); 863 837 864 838 return status; 865 839 } ··· 909 883 * Returns true if all copied bytes have been written to the pagecache, 910 884 * otherwise return false. 911 885 */ 912 - static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, 913 - size_t copied, struct folio *folio) 886 + static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied, 887 + struct folio *folio) 914 888 { 915 889 const struct iomap *srcmap = iomap_iter_srcmap(iter); 890 + loff_t pos = iter->pos; 916 891 917 892 if (srcmap->type == IOMAP_INLINE) { 918 893 iomap_write_end_inline(iter, folio, pos, copied); ··· 944 917 struct folio *folio; 945 918 loff_t old_size; 946 919 size_t offset; /* Offset into folio */ 947 - size_t bytes; /* Bytes to write to folio */ 920 + u64 bytes; /* Bytes to write to folio */ 948 921 size_t copied; /* Bytes copied from user */ 949 922 u64 written; /* Bytes have been written */ 950 - loff_t pos = iter->pos; 923 + loff_t pos; 951 924 952 925 bytes = iov_iter_count(i); 953 926 retry: 954 - offset = pos & (chunk - 1); 927 + offset = iter->pos & (chunk - 1); 955 928 bytes = min(chunk - offset, bytes); 956 929 status = balance_dirty_pages_ratelimited_flags(mapping, 957 930 bdp_flags); ··· 976 949 break; 977 950 } 978 951 979 - status = iomap_write_begin(iter, pos, bytes, &folio); 952 + status = iomap_write_begin(iter, &folio, &offset, &bytes); 980 953 if (unlikely(status)) { 981 - 
iomap_write_failed(iter->inode, pos, bytes); 954 + iomap_write_failed(iter->inode, iter->pos, bytes); 982 955 break; 983 956 } 984 957 if (iter->iomap.flags & IOMAP_F_STALE) 985 958 break; 986 959 987 - offset = offset_in_folio(folio, pos); 988 - if (bytes > folio_size(folio) - offset) 989 - bytes = folio_size(folio) - offset; 960 + pos = iter->pos; 990 961 991 962 if (mapping_writably_mapped(mapping)) 992 963 flush_dcache_folio(folio); 993 964 994 965 copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); 995 - written = iomap_write_end(iter, pos, bytes, copied, folio) ? 966 + written = iomap_write_end(iter, bytes, copied, folio) ? 996 967 copied : 0; 997 968 998 969 /* ··· 1005 980 i_size_write(iter->inode, pos + written); 1006 981 iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; 1007 982 } 1008 - __iomap_put_folio(iter, pos, written, folio); 983 + __iomap_put_folio(iter, written, folio); 1009 984 1010 985 if (old_size < pos) 1011 986 pagecache_isize_extended(iter->inode, old_size, pos); ··· 1301 1276 do { 1302 1277 struct folio *folio; 1303 1278 size_t offset; 1304 - loff_t pos = iter->pos; 1305 1279 bool ret; 1306 1280 1307 1281 bytes = min_t(u64, SIZE_MAX, bytes); 1308 - status = iomap_write_begin(iter, pos, bytes, &folio); 1282 + status = iomap_write_begin(iter, &folio, &offset, &bytes); 1309 1283 if (unlikely(status)) 1310 1284 return status; 1311 1285 if (iomap->flags & IOMAP_F_STALE) 1312 1286 break; 1313 1287 1314 - offset = offset_in_folio(folio, pos); 1315 - if (bytes > folio_size(folio) - offset) 1316 - bytes = folio_size(folio) - offset; 1317 - 1318 - ret = iomap_write_end(iter, pos, bytes, bytes, folio); 1319 - __iomap_put_folio(iter, pos, bytes, folio); 1288 + ret = iomap_write_end(iter, bytes, bytes, folio); 1289 + __iomap_put_folio(iter, bytes, folio); 1320 1290 if (WARN_ON_ONCE(!ret)) 1321 1291 return -EIO; 1322 1292 ··· 1371 1351 do { 1372 1352 struct folio *folio; 1373 1353 size_t offset; 1374 - loff_t pos = iter->pos; 1375 1354 bool ret; 
1376 1355 1377 1356 bytes = min_t(u64, SIZE_MAX, bytes); 1378 - status = iomap_write_begin(iter, pos, bytes, &folio); 1357 + status = iomap_write_begin(iter, &folio, &offset, &bytes); 1379 1358 if (status) 1380 1359 return status; 1381 1360 if (iter->iomap.flags & IOMAP_F_STALE) ··· 1382 1363 1383 1364 /* warn about zeroing folios beyond eof that won't write back */ 1384 1365 WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size); 1385 - offset = offset_in_folio(folio, pos); 1386 - if (bytes > folio_size(folio) - offset) 1387 - bytes = folio_size(folio) - offset; 1388 1366 1389 1367 folio_zero_range(folio, offset, bytes); 1390 1368 folio_mark_accessed(folio); 1391 1369 1392 - ret = iomap_write_end(iter, pos, bytes, bytes, folio); 1393 - __iomap_put_folio(iter, pos, bytes, folio); 1370 + ret = iomap_write_end(iter, bytes, bytes, folio); 1371 + __iomap_put_folio(iter, bytes, folio); 1394 1372 if (WARN_ON_ONCE(!ret)) 1395 1373 return -EIO; 1396 1374
+21 -6
fs/iomap/trace.h
··· 99 99 { IOMAP_FAULT, "FAULT" }, \ 100 100 { IOMAP_DIRECT, "DIRECT" }, \ 101 101 { IOMAP_NOWAIT, "NOWAIT" }, \ 102 - { IOMAP_ATOMIC, "ATOMIC" } 102 + { IOMAP_OVERWRITE_ONLY, "OVERWRITE_ONLY" }, \ 103 + { IOMAP_UNSHARE, "UNSHARE" }, \ 104 + { IOMAP_DAX, "DAX" }, \ 105 + { IOMAP_ATOMIC, "ATOMIC" }, \ 106 + { IOMAP_DONTCACHE, "DONTCACHE" } 103 107 104 108 #define IOMAP_F_FLAGS_STRINGS \ 105 109 { IOMAP_F_NEW, "NEW" }, \ ··· 111 107 { IOMAP_F_SHARED, "SHARED" }, \ 112 108 { IOMAP_F_MERGED, "MERGED" }, \ 113 109 { IOMAP_F_BUFFER_HEAD, "BH" }, \ 114 - { IOMAP_F_SIZE_CHANGED, "SIZE_CHANGED" } 110 + { IOMAP_F_XATTR, "XATTR" }, \ 111 + { IOMAP_F_BOUNDARY, "BOUNDARY" }, \ 112 + { IOMAP_F_ANON_WRITE, "ANON_WRITE" }, \ 113 + { IOMAP_F_ATOMIC_BIO, "ATOMIC_BIO" }, \ 114 + { IOMAP_F_PRIVATE, "PRIVATE" }, \ 115 + { IOMAP_F_SIZE_CHANGED, "SIZE_CHANGED" }, \ 116 + { IOMAP_F_STALE, "STALE" } 117 + 115 118 116 119 #define IOMAP_DIO_STRINGS \ 117 120 {IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \ ··· 149 138 __entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0; 150 139 ), 151 140 TP_printk("dev %d:%d ino 0x%llx bdev %d:%d addr 0x%llx offset 0x%llx " 152 - "length 0x%llx type %s flags %s", 141 + "length 0x%llx type %s (0x%x) flags %s (0x%x)", 153 142 MAJOR(__entry->dev), MINOR(__entry->dev), 154 143 __entry->ino, 155 144 MAJOR(__entry->bdev), MINOR(__entry->bdev), ··· 157 146 __entry->offset, 158 147 __entry->length, 159 148 __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS), 160 - __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS)) 149 + __entry->type, 150 + __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS), 151 + __entry->flags) 161 152 ) 162 153 163 154 #define DEFINE_IOMAP_EVENT(name) \ ··· 198 185 __entry->bdev = iomap->bdev ? 
iomap->bdev->bd_dev : 0; 199 186 ), 200 187 TP_printk("dev %d:%d ino 0x%llx bdev %d:%d pos 0x%llx dirty len 0x%llx " 201 - "addr 0x%llx offset 0x%llx length 0x%llx type %s flags %s", 188 + "addr 0x%llx offset 0x%llx length 0x%llx type %s (0x%x) flags %s (0x%x)", 202 189 MAJOR(__entry->dev), MINOR(__entry->dev), 203 190 __entry->ino, 204 191 MAJOR(__entry->bdev), MINOR(__entry->bdev), ··· 208 195 __entry->offset, 209 196 __entry->length, 210 197 __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS), 211 - __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS)) 198 + __entry->type, 199 + __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS), 200 + __entry->flags) 212 201 ); 213 202 214 203 TRACE_EVENT(iomap_iter,