
Merge tag 'vfs-6.19-rc1.iomap' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull iomap updates from Christian Brauner:
"FUSE iomap Support for Buffered Reads:

This adds iomap support for FUSE buffered reads and readahead,
enabling granular uptodate tracking with large folios so that only
the non-uptodate portions of a folio need to be read. It also fixes
a race between large folios and the writeback cache that could
corrupt data when a partial write was followed by a read.

- Refactored iomap read/readahead bio logic into helpers
- Added caller-provided callbacks for read operations
- Moved buffered IO bio logic into new file
- FUSE now uses iomap for read_folio and readahead

Zero Range Folio Batch Support:

Add folio batch support to iomap_zero_range() so it can handle dirty
folios over unwritten mappings, and fix races in which dirty data
could be lost during zero range operations.

- filemap_get_folios_tag_range() helper for dirty folio lookup
- Optional zero range dirty folio processing
- XFS fills dirty folios on zero range of unwritten mappings
- Removed old partial EOF zeroing optimization

DIO Write Completions from Interrupt Context:

Restore the pre-iomap behavior where pure overwrite completions run
inline rather than being deferred to a workqueue. This reduces
context switches for high-performance workloads such as ScyllaDB.

- Removed unused IOCB_DIO_CALLER_COMP code
- Error completions always run in user context (fixes zonefs)
- Reworked REQ_FUA selection logic
- Inverted IOMAP_DIO_INLINE_COMP to IOMAP_DIO_OFFLOAD_COMP

Buffered IO Cleanups:

Some performance and code clarity improvements:

- Replace manual bitmap scanning with find_next_bit()
- Simplify read skip logic for writes
- Optimize pending async writeback accounting
- Better variable naming
- Documentation for iomap_finish_folio_write() requirements

Misaligned Vectors for Zoned XFS:

Enables sub-block aligned vectors in XFS always-COW mode for zoned
devices via the new IOMAP_DIO_FSBLOCK_ALIGNED flag.

Bug Fixes:

- Allocate s_dio_done_wq for async reads (fixes syzbot report after
error completion changes)
- Fix iomap_read_end() for already uptodate folios (regression fix)"

* tag 'vfs-6.19-rc1.iomap' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (40 commits)
iomap: allocate s_dio_done_wq for async reads as well
iomap: fix iomap_read_end() for already uptodate folios
iomap: invert the polarity of IOMAP_DIO_INLINE_COMP
iomap: support write completions from interrupt context
iomap: rework REQ_FUA selection
iomap: always run error completions in user context
fs, iomap: remove IOCB_DIO_CALLER_COMP
iomap: use find_next_bit() for uptodate bitmap scanning
iomap: use find_next_bit() for dirty bitmap scanning
iomap: simplify when reads can be skipped for writes
iomap: simplify ->read_folio_range() error handling for reads
iomap: optimize pending async writeback accounting
docs: document iomap writeback's iomap_finish_folio_write() requirement
iomap: account for unaligned end offsets when truncating read range
iomap: rename bytes_pending/bytes_accounted to bytes_submitted/bytes_not_submitted
xfs: support sub-block aligned vectors in always COW mode
iomap: add IOMAP_DIO_FSBLOCK_ALIGNED flag
xfs: error tag to force zeroing on debug kernels
iomap: remove old partial eof zeroing optimization
xfs: fill dirty folios on zero range of unwritten mappings
...

+1096 -636
+46 -4
Documentation/filesystems/iomap/operations.rst
··· 135 135 136 136 * ``IOCB_DONTCACHE``: Turns on ``IOMAP_DONTCACHE``. 137 137 138 + ``struct iomap_read_ops`` 139 + -------------------------- 140 + 141 + .. code-block:: c 142 + 143 + struct iomap_read_ops { 144 + int (*read_folio_range)(const struct iomap_iter *iter, 145 + struct iomap_read_folio_ctx *ctx, size_t len); 146 + void (*submit_read)(struct iomap_read_folio_ctx *ctx); 147 + }; 148 + 149 + iomap calls these functions: 150 + 151 + - ``read_folio_range``: Called to read in the range. This must be provided 152 + by the caller. If this succeeds, iomap_finish_folio_read() must be called 153 + after the range is read in, regardless of whether the read succeeded or 154 + failed. 155 + 156 + - ``submit_read``: Submit any pending read requests. This function is 157 + optional. 158 + 138 159 Internal per-Folio State 139 160 ------------------------ 140 161 ··· 202 181 The ``flags`` argument to ``->iomap_begin`` will be set to zero. 203 182 The pagecache takes whatever locks it needs before calling the 204 183 filesystem. 184 + 185 + Both ``iomap_readahead`` and ``iomap_read_folio`` pass in a ``struct 186 + iomap_read_folio_ctx``: 187 + 188 + .. code-block:: c 189 + 190 + struct iomap_read_folio_ctx { 191 + const struct iomap_read_ops *ops; 192 + struct folio *cur_folio; 193 + struct readahead_control *rac; 194 + void *read_ctx; 195 + }; 196 + 197 + ``iomap_readahead`` must set: 198 + * ``ops->read_folio_range()`` and ``rac`` 199 + 200 + ``iomap_read_folio`` must set: 201 + * ``ops->read_folio_range()`` and ``cur_folio`` 202 + 203 + ``ops->submit_read()`` and ``read_ctx`` are optional. ``read_ctx`` is used to 204 + pass in any custom data the caller needs accessible in the ops callbacks for 205 + fulfilling reads. 205 206 206 207 Buffered Writes 207 208 --------------- ··· 360 317 delalloc reservations to avoid having delalloc reservations for 361 318 clean pagecache. 362 319 This function must be supplied by the filesystem. 
320 + If this succeeds, iomap_finish_folio_write() must be called once writeback 321 + completes for the range, regardless of whether the writeback succeeded or 322 + failed. 363 323 364 324 - ``writeback_submit``: Submit the previous built writeback context. 365 325 Block based file systems should use the iomap_ioend_writeback_submit ··· 489 443 interrupt. 490 444 Only meaningful for asynchronous I/O, and only if the entire I/O can 491 445 be issued as a single ``struct bio``. 492 - 493 - * ``IOCB_DIO_CALLER_COMP``: Try to run I/O completion from the caller's 494 - process context. 495 - See ``linux/fs.h`` for more details. 496 446 497 447 Filesystems should call ``iomap_dio_rw`` from ``->read_iter`` and 498 448 ``->write_iter``, and set ``FMODE_CAN_ODIRECT`` in the ``->open``
+3 -2
block/fops.c
··· 540 540 #else /* CONFIG_BUFFER_HEAD */ 541 541 static int blkdev_read_folio(struct file *file, struct folio *folio) 542 542 { 543 - return iomap_read_folio(folio, &blkdev_iomap_ops); 543 + iomap_bio_read_folio(folio, &blkdev_iomap_ops); 544 + return 0; 544 545 } 545 546 546 547 static void blkdev_readahead(struct readahead_control *rac) 547 548 { 548 - iomap_readahead(rac, &blkdev_iomap_ops); 549 + iomap_bio_readahead(rac, &blkdev_iomap_ops); 549 550 } 550 551 551 552 static ssize_t blkdev_writeback_range(struct iomap_writepage_ctx *wpc,
-6
fs/backing-file.c
··· 227 227 !(file->f_mode & FMODE_CAN_ODIRECT)) 228 228 return -EINVAL; 229 229 230 - /* 231 - * Stacked filesystems don't support deferred completions, don't copy 232 - * this property in case it is set by the issuer. 233 - */ 234 - flags &= ~IOCB_DIO_CALLER_COMP; 235 - 236 230 old_cred = override_creds(ctx->cred); 237 231 if (is_sync_kiocb(iocb)) { 238 232 rwf_t rwf = iocb_to_rw_flags(flags);
+12 -18
fs/dax.c
··· 1507 1507 1508 1508 /* already zeroed? we're done. */ 1509 1509 if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) 1510 - return iomap_iter_advance(iter, &length); 1510 + return iomap_iter_advance(iter, length); 1511 1511 1512 1512 /* 1513 1513 * invalidate the pages whose sharing state is to be changed ··· 1536 1536 if (ret < 0) 1537 1537 return ret; 1538 1538 1539 - ret = iomap_iter_advance(iter, &length); 1539 + ret = iomap_iter_advance(iter, length); 1540 1540 if (ret) 1541 1541 return ret; 1542 - } while (length > 0); 1542 + } while ((length = iomap_length(iter)) > 0); 1543 1543 1544 1544 if (did_zero) 1545 1545 *did_zero = true; ··· 1597 1597 1598 1598 if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) { 1599 1599 done = iov_iter_zero(min(length, end - pos), iter); 1600 - return iomap_iter_advance(iomi, &done); 1600 + return iomap_iter_advance(iomi, done); 1601 1601 } 1602 1602 } 1603 1603 ··· 1681 1681 xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr, 1682 1682 map_len, iter); 1683 1683 1684 - length = xfer; 1685 - ret = iomap_iter_advance(iomi, &length); 1684 + ret = iomap_iter_advance(iomi, xfer); 1686 1685 if (!ret && xfer == 0) 1687 1686 ret = -EFAULT; 1688 1687 if (xfer < map_len) 1689 1688 break; 1689 + length = iomap_length(iomi); 1690 1690 } 1691 1691 dax_read_unlock(id); 1692 1692 ··· 1919 1919 ret |= VM_FAULT_MAJOR; 1920 1920 } 1921 1921 1922 - if (!(ret & VM_FAULT_ERROR)) { 1923 - u64 length = PAGE_SIZE; 1924 - iter.status = iomap_iter_advance(&iter, &length); 1925 - } 1922 + if (!(ret & VM_FAULT_ERROR)) 1923 + iter.status = iomap_iter_advance(&iter, PAGE_SIZE); 1926 1924 } 1927 1925 1928 1926 if (iomap_errp) ··· 2032 2034 continue; /* actually breaks out of the loop */ 2033 2035 2034 2036 ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true); 2035 - if (ret != VM_FAULT_FALLBACK) { 2036 - u64 length = PMD_SIZE; 2037 - iter.status = iomap_iter_advance(&iter, &length); 2038 - } 2037 + if (ret != VM_FAULT_FALLBACK) 
2038 + iter.status = iomap_iter_advance(&iter, PMD_SIZE); 2039 2039 } 2040 2040 2041 2041 unlock_entry: ··· 2159 2163 const struct iomap *smap = &it_src->iomap; 2160 2164 const struct iomap *dmap = &it_dest->iomap; 2161 2165 loff_t pos1 = it_src->pos, pos2 = it_dest->pos; 2162 - u64 dest_len; 2163 2166 void *saddr, *daddr; 2164 2167 int id, ret; 2165 2168 ··· 2191 2196 dax_read_unlock(id); 2192 2197 2193 2198 advance: 2194 - dest_len = len; 2195 - ret = iomap_iter_advance(it_src, &len); 2199 + ret = iomap_iter_advance(it_src, len); 2196 2200 if (!ret) 2197 - ret = iomap_iter_advance(it_dest, &dest_len); 2201 + ret = iomap_iter_advance(it_dest, len); 2198 2202 return ret; 2199 2203 2200 2204 out_unlock:
+3 -2
fs/erofs/data.c
··· 371 371 { 372 372 trace_erofs_read_folio(folio, true); 373 373 374 - return iomap_read_folio(folio, &erofs_iomap_ops); 374 + iomap_bio_read_folio(folio, &erofs_iomap_ops); 375 + return 0; 375 376 } 376 377 377 378 static void erofs_readahead(struct readahead_control *rac) ··· 380 379 trace_erofs_readahead(rac->mapping->host, readahead_index(rac), 381 380 readahead_count(rac), true); 382 381 383 - return iomap_readahead(rac, &erofs_iomap_ops); 382 + iomap_bio_readahead(rac, &erofs_iomap_ops); 384 383 } 385 384 386 385 static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
+1 -1
fs/fuse/dir.c
··· 1192 1192 if (attr->blksize != 0) 1193 1193 blkbits = ilog2(attr->blksize); 1194 1194 else 1195 - blkbits = fc->blkbits; 1195 + blkbits = inode->i_sb->s_blocksize_bits; 1196 1196 1197 1197 stat->blksize = 1 << blkbits; 1198 1198 }
+170 -116
fs/fuse/file.c
··· 834 834 return 0; 835 835 } 836 836 837 + static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length, 838 + unsigned int flags, struct iomap *iomap, 839 + struct iomap *srcmap) 840 + { 841 + iomap->type = IOMAP_MAPPED; 842 + iomap->length = length; 843 + iomap->offset = offset; 844 + return 0; 845 + } 846 + 847 + static const struct iomap_ops fuse_iomap_ops = { 848 + .iomap_begin = fuse_iomap_begin, 849 + }; 850 + 851 + struct fuse_fill_read_data { 852 + struct file *file; 853 + 854 + /* Fields below are used if sending the read request asynchronously */ 855 + struct fuse_conn *fc; 856 + struct fuse_io_args *ia; 857 + unsigned int nr_bytes; 858 + }; 859 + 860 + /* forward declarations */ 861 + static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos, 862 + unsigned len, struct fuse_args_pages *ap, 863 + unsigned cur_bytes, bool write); 864 + static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, 865 + unsigned int count, bool async); 866 + 867 + static int fuse_handle_readahead(struct folio *folio, 868 + struct readahead_control *rac, 869 + struct fuse_fill_read_data *data, loff_t pos, 870 + size_t len) 871 + { 872 + struct fuse_io_args *ia = data->ia; 873 + size_t off = offset_in_folio(folio, pos); 874 + struct fuse_conn *fc = data->fc; 875 + struct fuse_args_pages *ap; 876 + unsigned int nr_pages; 877 + 878 + if (ia && fuse_folios_need_send(fc, pos, len, &ia->ap, data->nr_bytes, 879 + false)) { 880 + fuse_send_readpages(ia, data->file, data->nr_bytes, 881 + fc->async_read); 882 + data->nr_bytes = 0; 883 + data->ia = NULL; 884 + ia = NULL; 885 + } 886 + if (!ia) { 887 + if (fc->num_background >= fc->congestion_threshold && 888 + rac->ra->async_size >= readahead_count(rac)) 889 + /* 890 + * Congested and only async pages left, so skip the 891 + * rest. 
892 + */ 893 + return -EAGAIN; 894 + 895 + nr_pages = min(fc->max_pages, readahead_count(rac)); 896 + data->ia = fuse_io_alloc(NULL, nr_pages); 897 + if (!data->ia) 898 + return -ENOMEM; 899 + ia = data->ia; 900 + } 901 + folio_get(folio); 902 + ap = &ia->ap; 903 + ap->folios[ap->num_folios] = folio; 904 + ap->descs[ap->num_folios].offset = off; 905 + ap->descs[ap->num_folios].length = len; 906 + data->nr_bytes += len; 907 + ap->num_folios++; 908 + 909 + return 0; 910 + } 911 + 912 + static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter, 913 + struct iomap_read_folio_ctx *ctx, 914 + size_t len) 915 + { 916 + struct fuse_fill_read_data *data = ctx->read_ctx; 917 + struct folio *folio = ctx->cur_folio; 918 + loff_t pos = iter->pos; 919 + size_t off = offset_in_folio(folio, pos); 920 + struct file *file = data->file; 921 + int ret; 922 + 923 + if (ctx->rac) { 924 + ret = fuse_handle_readahead(folio, ctx->rac, data, pos, len); 925 + } else { 926 + /* 927 + * for non-readahead read requests, do reads synchronously 928 + * since it's not guaranteed that the server can handle 929 + * out-of-order reads 930 + */ 931 + ret = fuse_do_readfolio(file, folio, off, len); 932 + if (!ret) 933 + iomap_finish_folio_read(folio, off, len, ret); 934 + } 935 + return ret; 936 + } 937 + 938 + static void fuse_iomap_read_submit(struct iomap_read_folio_ctx *ctx) 939 + { 940 + struct fuse_fill_read_data *data = ctx->read_ctx; 941 + 942 + if (data->ia) 943 + fuse_send_readpages(data->ia, data->file, data->nr_bytes, 944 + data->fc->async_read); 945 + } 946 + 947 + static const struct iomap_read_ops fuse_iomap_read_ops = { 948 + .read_folio_range = fuse_iomap_read_folio_range_async, 949 + .submit_read = fuse_iomap_read_submit, 950 + }; 951 + 837 952 static int fuse_read_folio(struct file *file, struct folio *folio) 838 953 { 839 954 struct inode *inode = folio->mapping->host; 840 - int err; 955 + struct fuse_fill_read_data data = { 956 + .file = file, 957 + }; 958 + struct 
iomap_read_folio_ctx ctx = { 959 + .cur_folio = folio, 960 + .ops = &fuse_iomap_read_ops, 961 + .read_ctx = &data, 841 962 842 - err = -EIO; 843 - if (fuse_is_bad(inode)) 844 - goto out; 963 + }; 845 964 846 - err = fuse_do_readfolio(file, folio, 0, folio_size(folio)); 847 - if (!err) 848 - folio_mark_uptodate(folio); 965 + if (fuse_is_bad(inode)) { 966 + folio_unlock(folio); 967 + return -EIO; 968 + } 849 969 970 + iomap_read_folio(&fuse_iomap_ops, &ctx); 850 971 fuse_invalidate_atime(inode); 851 - out: 852 - folio_unlock(folio); 853 - return err; 972 + return 0; 854 973 } 855 974 856 975 static int fuse_iomap_read_folio_range(const struct iomap_iter *iter, ··· 1006 887 fuse_invalidate_atime(inode); 1007 888 1008 889 for (i = 0; i < ap->num_folios; i++) { 1009 - folio_end_read(ap->folios[i], !err); 890 + iomap_finish_folio_read(ap->folios[i], ap->descs[i].offset, 891 + ap->descs[i].length, err); 1010 892 folio_put(ap->folios[i]); 1011 893 } 1012 894 if (ia->ff) ··· 1017 897 } 1018 898 1019 899 static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, 1020 - unsigned int count) 900 + unsigned int count, bool async) 1021 901 { 1022 902 struct fuse_file *ff = file->private_data; 1023 903 struct fuse_mount *fm = ff->fm; ··· 1039 919 1040 920 fuse_read_args_fill(ia, file, pos, count, FUSE_READ); 1041 921 ia->read.attr_ver = fuse_get_attr_version(fm->fc); 1042 - if (fm->fc->async_read) { 922 + if (async) { 1043 923 ia->ff = fuse_file_get(ff); 1044 924 ap->args.end = fuse_readpages_end; 1045 925 err = fuse_simple_background(fm, &ap->args, GFP_KERNEL); ··· 1056 936 { 1057 937 struct inode *inode = rac->mapping->host; 1058 938 struct fuse_conn *fc = get_fuse_conn(inode); 1059 - unsigned int max_pages, nr_pages; 1060 - struct folio *folio = NULL; 939 + struct fuse_fill_read_data data = { 940 + .file = rac->file, 941 + .fc = fc, 942 + }; 943 + struct iomap_read_folio_ctx ctx = { 944 + .ops = &fuse_iomap_read_ops, 945 + .rac = rac, 946 + .read_ctx = &data 
947 + }; 1061 948 1062 949 if (fuse_is_bad(inode)) 1063 950 return; 1064 951 1065 - max_pages = min_t(unsigned int, fc->max_pages, 1066 - fc->max_read / PAGE_SIZE); 1067 - 1068 - /* 1069 - * This is only accurate the first time through, since readahead_folio() 1070 - * doesn't update readahead_count() from the previous folio until the 1071 - * next call. Grab nr_pages here so we know how many pages we're going 1072 - * to have to process. This means that we will exit here with 1073 - * readahead_count() == folio_nr_pages(last_folio), but we will have 1074 - * consumed all of the folios, and read_pages() will call 1075 - * readahead_folio() again which will clean up the rac. 1076 - */ 1077 - nr_pages = readahead_count(rac); 1078 - 1079 - while (nr_pages) { 1080 - struct fuse_io_args *ia; 1081 - struct fuse_args_pages *ap; 1082 - unsigned cur_pages = min(max_pages, nr_pages); 1083 - unsigned int pages = 0; 1084 - 1085 - if (fc->num_background >= fc->congestion_threshold && 1086 - rac->ra->async_size >= readahead_count(rac)) 1087 - /* 1088 - * Congested and only async pages left, so skip the 1089 - * rest. 1090 - */ 1091 - break; 1092 - 1093 - ia = fuse_io_alloc(NULL, cur_pages); 1094 - if (!ia) 1095 - break; 1096 - ap = &ia->ap; 1097 - 1098 - while (pages < cur_pages) { 1099 - unsigned int folio_pages; 1100 - 1101 - /* 1102 - * This returns a folio with a ref held on it. 1103 - * The ref needs to be held until the request is 1104 - * completed, since the splice case (see 1105 - * fuse_try_move_page()) drops the ref after it's 1106 - * replaced in the page cache. 1107 - */ 1108 - if (!folio) 1109 - folio = __readahead_folio(rac); 1110 - 1111 - folio_pages = folio_nr_pages(folio); 1112 - if (folio_pages > cur_pages - pages) { 1113 - /* 1114 - * Large folios belonging to fuse will never 1115 - * have more pages than max_pages. 
1116 - */ 1117 - WARN_ON(!pages); 1118 - break; 1119 - } 1120 - 1121 - ap->folios[ap->num_folios] = folio; 1122 - ap->descs[ap->num_folios].length = folio_size(folio); 1123 - ap->num_folios++; 1124 - pages += folio_pages; 1125 - folio = NULL; 1126 - } 1127 - fuse_send_readpages(ia, rac->file, pages << PAGE_SHIFT); 1128 - nr_pages -= pages; 1129 - } 1130 - if (folio) { 1131 - folio_end_read(folio, false); 1132 - folio_put(folio); 1133 - } 952 + iomap_readahead(&fuse_iomap_ops, &ctx); 1134 953 } 1135 954 1136 955 static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) ··· 1454 1395 1455 1396 static const struct iomap_write_ops fuse_iomap_write_ops = { 1456 1397 .read_folio_range = fuse_iomap_read_folio_range, 1457 - }; 1458 - 1459 - static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length, 1460 - unsigned int flags, struct iomap *iomap, 1461 - struct iomap *srcmap) 1462 - { 1463 - iomap->type = IOMAP_MAPPED; 1464 - iomap->length = length; 1465 - iomap->offset = offset; 1466 - return 0; 1467 - } 1468 - 1469 - static const struct iomap_ops fuse_iomap_ops = { 1470 - .iomap_begin = fuse_iomap_begin, 1471 1398 }; 1472 1399 1473 1400 static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) ··· 1879 1834 * scope of the fi->lock alleviates xarray lock 1880 1835 * contention and noticeably improves performance. 1881 1836 */ 1882 - iomap_finish_folio_write(inode, ap->folios[i], 1); 1837 + iomap_finish_folio_write(inode, ap->folios[i], 1838 + ap->descs[i].length); 1883 1839 1884 1840 wake_up(&fi->page_waitq); 1885 1841 } ··· 2093 2047 struct fuse_file *ff; 2094 2048 unsigned int max_folios; 2095 2049 /* 2096 - * nr_bytes won't overflow since fuse_writepage_need_send() caps 2050 + * nr_bytes won't overflow since fuse_folios_need_send() caps 2097 2051 * wb requests to never exceed fc->max_pages (which has an upper bound 2098 2052 * of U16_MAX). 
2099 2053 */ ··· 2138 2092 spin_unlock(&fi->lock); 2139 2093 } 2140 2094 2141 - static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos, 2142 - unsigned len, struct fuse_args_pages *ap, 2143 - struct fuse_fill_wb_data *data) 2095 + static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos, 2096 + unsigned len, struct fuse_args_pages *ap, 2097 + unsigned cur_bytes, bool write) 2144 2098 { 2145 2099 struct folio *prev_folio; 2146 2100 struct fuse_folio_desc prev_desc; 2147 - unsigned bytes = data->nr_bytes + len; 2101 + unsigned bytes = cur_bytes + len; 2148 2102 loff_t prev_pos; 2103 + size_t max_bytes = write ? fc->max_write : fc->max_read; 2149 2104 2150 2105 WARN_ON(!ap->num_folios); 2151 2106 ··· 2154 2107 if ((bytes + PAGE_SIZE - 1) >> PAGE_SHIFT > fc->max_pages) 2155 2108 return true; 2156 2109 2157 - /* Reached max write bytes */ 2158 - if (bytes > fc->max_write) 2110 + if (bytes > max_bytes) 2159 2111 return true; 2160 2112 2161 2113 /* Discontinuity */ ··· 2162 2116 prev_desc = ap->descs[ap->num_folios - 1]; 2163 2117 prev_pos = folio_pos(prev_folio) + prev_desc.offset + prev_desc.length; 2164 2118 if (prev_pos != pos) 2165 - return true; 2166 - 2167 - /* Need to grow the pages array? If so, did the expansion fail? */ 2168 - if (ap->num_folios == data->max_folios && 2169 - !fuse_pages_realloc(data, fc->max_pages)) 2170 2119 return true; 2171 2120 2172 2121 return false; ··· 2187 2146 return -EIO; 2188 2147 } 2189 2148 2190 - if (wpa && fuse_writepage_need_send(fc, pos, len, ap, data)) { 2191 - fuse_writepages_send(inode, data); 2192 - data->wpa = NULL; 2193 - data->nr_bytes = 0; 2149 + if (wpa) { 2150 + bool send = fuse_folios_need_send(fc, pos, len, ap, 2151 + data->nr_bytes, true); 2152 + 2153 + if (!send) { 2154 + /* 2155 + * Need to grow the pages array? If so, did the 2156 + * expansion fail? 
2157 + */ 2158 + send = (ap->num_folios == data->max_folios) && 2159 + !fuse_pages_realloc(data, fc->max_pages); 2160 + } 2161 + 2162 + if (send) { 2163 + fuse_writepages_send(inode, data); 2164 + data->wpa = NULL; 2165 + data->nr_bytes = 0; 2166 + } 2194 2167 } 2195 2168 2196 2169 if (data->wpa == NULL) { ··· 2216 2161 ap = &wpa->ia.ap; 2217 2162 } 2218 2163 2219 - iomap_start_folio_write(inode, folio, 1); 2220 2164 fuse_writepage_args_page_fill(wpa, folio, ap->num_folios, 2221 2165 offset, len); 2222 2166 data->nr_bytes += len;
-8
fs/fuse/fuse_i.h
··· 981 981 /* Request timeout (in jiffies). 0 = no timeout */ 982 982 unsigned int req_timeout; 983 983 } timeout; 984 - 985 - /* 986 - * This is a workaround until fuse uses iomap for reads. 987 - * For fuseblk servers, this represents the blocksize passed in at 988 - * mount time and for regular fuse servers, this is equivalent to 989 - * inode->i_blkbits. 990 - */ 991 - u8 blkbits; 992 984 }; 993 985 994 986 /*
+1 -12
fs/fuse/inode.c
··· 291 291 if (attr->blksize) 292 292 fi->cached_i_blkbits = ilog2(attr->blksize); 293 293 else 294 - fi->cached_i_blkbits = fc->blkbits; 294 + fi->cached_i_blkbits = inode->i_sb->s_blocksize_bits; 295 295 296 296 /* 297 297 * Don't set the sticky bit in i_mode, unless we want the VFS ··· 1838 1838 err = -EINVAL; 1839 1839 if (!sb_set_blocksize(sb, ctx->blksize)) 1840 1840 goto err; 1841 - /* 1842 - * This is a workaround until fuse hooks into iomap for reads. 1843 - * Use PAGE_SIZE for the blocksize else if the writeback cache 1844 - * is enabled, buffered writes go through iomap and a read may 1845 - * overwrite partially written data if blocksize < PAGE_SIZE 1846 - */ 1847 - fc->blkbits = sb->s_blocksize_bits; 1848 - if (ctx->blksize != PAGE_SIZE && 1849 - !sb_set_blocksize(sb, PAGE_SIZE)) 1850 - goto err; 1851 1841 #endif 1852 1842 fc->sync_fs = 1; 1853 1843 } else { 1854 1844 sb->s_blocksize = PAGE_SIZE; 1855 1845 sb->s_blocksize_bits = PAGE_SHIFT; 1856 - fc->blkbits = sb->s_blocksize_bits; 1857 1846 } 1858 1847 1859 1848 sb->s_subtype = ctx->subtype;
+3 -3
fs/gfs2/aops.c
··· 424 424 struct inode *inode = folio->mapping->host; 425 425 struct gfs2_inode *ip = GFS2_I(inode); 426 426 struct gfs2_sbd *sdp = GFS2_SB(inode); 427 - int error; 427 + int error = 0; 428 428 429 429 if (!gfs2_is_jdata(ip) || 430 430 (i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) { 431 - error = iomap_read_folio(folio, &gfs2_iomap_ops); 431 + iomap_bio_read_folio(folio, &gfs2_iomap_ops); 432 432 } else if (gfs2_is_stuffed(ip)) { 433 433 error = stuffed_read_folio(ip, folio); 434 434 } else { ··· 503 503 else if (gfs2_is_jdata(ip)) 504 504 mpage_readahead(rac, gfs2_block_map); 505 505 else 506 - iomap_readahead(rac, &gfs2_iomap_ops); 506 + iomap_bio_readahead(rac, &gfs2_iomap_ops); 507 507 } 508 508 509 509 /**
+2 -1
fs/iomap/Makefile
··· 14 14 iomap-$(CONFIG_BLOCK) += direct-io.o \ 15 15 ioend.o \ 16 16 fiemap.o \ 17 - seek.o 17 + seek.o \ 18 + bio.o 18 19 iomap-$(CONFIG_SWAP) += swapfile.o
+88
fs/iomap/bio.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2010 Red Hat, Inc. 4 + * Copyright (C) 2016-2023 Christoph Hellwig. 5 + */ 6 + #include <linux/iomap.h> 7 + #include <linux/pagemap.h> 8 + #include "internal.h" 9 + #include "trace.h" 10 + 11 + static void iomap_read_end_io(struct bio *bio) 12 + { 13 + int error = blk_status_to_errno(bio->bi_status); 14 + struct folio_iter fi; 15 + 16 + bio_for_each_folio_all(fi, bio) 17 + iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error); 18 + bio_put(bio); 19 + } 20 + 21 + static void iomap_bio_submit_read(struct iomap_read_folio_ctx *ctx) 22 + { 23 + struct bio *bio = ctx->read_ctx; 24 + 25 + if (bio) 26 + submit_bio(bio); 27 + } 28 + 29 + static int iomap_bio_read_folio_range(const struct iomap_iter *iter, 30 + struct iomap_read_folio_ctx *ctx, size_t plen) 31 + { 32 + struct folio *folio = ctx->cur_folio; 33 + const struct iomap *iomap = &iter->iomap; 34 + loff_t pos = iter->pos; 35 + size_t poff = offset_in_folio(folio, pos); 36 + loff_t length = iomap_length(iter); 37 + sector_t sector; 38 + struct bio *bio = ctx->read_ctx; 39 + 40 + sector = iomap_sector(iomap, pos); 41 + if (!bio || bio_end_sector(bio) != sector || 42 + !bio_add_folio(bio, folio, plen, poff)) { 43 + gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); 44 + gfp_t orig_gfp = gfp; 45 + unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); 46 + 47 + if (bio) 48 + submit_bio(bio); 49 + 50 + if (ctx->rac) /* same as readahead_gfp_mask */ 51 + gfp |= __GFP_NORETRY | __GFP_NOWARN; 52 + bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs), REQ_OP_READ, 53 + gfp); 54 + /* 55 + * If the bio_alloc fails, try it again for a single page to 56 + * avoid having to deal with partial page reads. This emulates 57 + * what do_mpage_read_folio does. 
58 + */ 59 + if (!bio) 60 + bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ, orig_gfp); 61 + if (ctx->rac) 62 + bio->bi_opf |= REQ_RAHEAD; 63 + bio->bi_iter.bi_sector = sector; 64 + bio->bi_end_io = iomap_read_end_io; 65 + bio_add_folio_nofail(bio, folio, plen, poff); 66 + ctx->read_ctx = bio; 67 + } 68 + return 0; 69 + } 70 + 71 + const struct iomap_read_ops iomap_bio_read_ops = { 72 + .read_folio_range = iomap_bio_read_folio_range, 73 + .submit_read = iomap_bio_submit_read, 74 + }; 75 + EXPORT_SYMBOL_GPL(iomap_bio_read_ops); 76 + 77 + int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter, 78 + struct folio *folio, loff_t pos, size_t len) 79 + { 80 + const struct iomap *srcmap = iomap_iter_srcmap(iter); 81 + struct bio_vec bvec; 82 + struct bio bio; 83 + 84 + bio_init(&bio, srcmap->bdev, &bvec, 1, REQ_OP_READ); 85 + bio.bi_iter.bi_sector = iomap_sector(srcmap, pos); 86 + bio_add_folio_nofail(&bio, folio, len, offset_in_folio(folio, pos)); 87 + return submit_bio_wait(&bio); 88 + }
+387 -253
fs/iomap/buffered-io.c
··· 8 8 #include <linux/writeback.h> 9 9 #include <linux/swap.h> 10 10 #include <linux/migrate.h> 11 + #include "internal.h" 11 12 #include "trace.h" 12 13 13 14 #include "../internal.h" ··· 38 37 return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio)); 39 38 } 40 39 41 - static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs, 42 - unsigned int block) 40 + /* 41 + * Find the next uptodate block in the folio. end_blk is inclusive. 42 + * If no uptodate block is found, this will return end_blk + 1. 43 + */ 44 + static unsigned ifs_next_uptodate_block(struct folio *folio, 45 + unsigned start_blk, unsigned end_blk) 43 46 { 44 - return test_bit(block, ifs->state); 47 + struct iomap_folio_state *ifs = folio->private; 48 + 49 + return find_next_bit(ifs->state, end_blk + 1, start_blk); 50 + } 51 + 52 + /* 53 + * Find the next non-uptodate block in the folio. end_blk is inclusive. 54 + * If no non-uptodate block is found, this will return end_blk + 1. 55 + */ 56 + static unsigned ifs_next_nonuptodate_block(struct folio *folio, 57 + unsigned start_blk, unsigned end_blk) 58 + { 59 + struct iomap_folio_state *ifs = folio->private; 60 + 61 + return find_next_zero_bit(ifs->state, end_blk + 1, start_blk); 45 62 } 46 63 47 64 static bool ifs_set_range_uptodate(struct folio *folio, ··· 94 75 folio_mark_uptodate(folio); 95 76 } 96 77 97 - static inline bool ifs_block_is_dirty(struct folio *folio, 98 - struct iomap_folio_state *ifs, int block) 78 + /* 79 + * Find the next dirty block in the folio. end_blk is inclusive. 80 + * If no dirty block is found, this will return end_blk + 1. 
+ */
+static unsigned ifs_next_dirty_block(struct folio *folio,
+		unsigned start_blk, unsigned end_blk)
 {
+	struct iomap_folio_state *ifs = folio->private;
 	struct inode *inode = folio->mapping->host;
-	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+	unsigned int blks = i_blocks_per_folio(inode, folio);
 
-	return test_bit(block + blks_per_folio, ifs->state);
+	return find_next_bit(ifs->state, blks + end_blk + 1,
+			blks + start_blk) - blks;
+}
+
+/*
+ * Find the next clean block in the folio. end_blk is inclusive.
+ * If no clean block is found, this will return end_blk + 1.
+ */
+static unsigned ifs_next_clean_block(struct folio *folio,
+		unsigned start_blk, unsigned end_blk)
+{
+	struct iomap_folio_state *ifs = folio->private;
+	struct inode *inode = folio->mapping->host;
+	unsigned int blks = i_blocks_per_folio(inode, folio);
+
+	return find_next_zero_bit(ifs->state, blks + end_blk + 1,
+			blks + start_blk) - blks;
 }
 
 static unsigned ifs_find_dirty_range(struct folio *folio,
···
 		offset_in_folio(folio, *range_start) >> inode->i_blkbits;
 	unsigned end_blk = min_not_zero(
 		offset_in_folio(folio, range_end) >> inode->i_blkbits,
-		i_blocks_per_folio(inode, folio));
-	unsigned nblks = 1;
+		i_blocks_per_folio(inode, folio)) - 1;
+	unsigned nblks;
 
-	while (!ifs_block_is_dirty(folio, ifs, start_blk))
-		if (++start_blk == end_blk)
-			return 0;
-
-	while (start_blk + nblks < end_blk) {
-		if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks))
-			break;
-		nblks++;
-	}
+	start_blk = ifs_next_dirty_block(folio, start_blk, end_blk);
+	if (start_blk > end_blk)
+		return 0;
+	if (start_blk == end_blk)
+		nblks = 1;
+	else
+		nblks = ifs_next_clean_block(folio, start_blk + 1, end_blk) -
+			start_blk;
 
 	*range_start =
 		folio_pos(folio) + (start_blk << inode->i_blkbits);
 	return nblks << inode->i_blkbits;
···
 }
 
 /*
+ * Calculate how many bytes to truncate based off the number of blocks to
+ * truncate and the end position to start truncating from.
+ */
+static size_t iomap_bytes_to_truncate(loff_t end_pos, unsigned block_bits,
+		unsigned blocks_truncated)
+{
+	unsigned block_size = 1 << block_bits;
+	unsigned block_offset = end_pos & (block_size - 1);
+
+	if (!block_offset)
+		return blocks_truncated << block_bits;
+
+	return ((blocks_truncated - 1) << block_bits) + block_offset;
+}
+
+/*
  * Calculate the range inside the folio that we actually need to read.
  */
 static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
···
 	 * to avoid reading in already uptodate ranges.
 	 */
 	if (ifs) {
-		unsigned int i;
+		unsigned int next, blocks_skipped;
 
-		/* move forward for each leading block marked uptodate */
-		for (i = first; i <= last; i++) {
-			if (!ifs_block_is_uptodate(ifs, i))
-				break;
-			*pos += block_size;
-			poff += block_size;
-			plen -= block_size;
-			first++;
+		next = ifs_next_nonuptodate_block(folio, first, last);
+		blocks_skipped = next - first;
+
+		if (blocks_skipped) {
+			unsigned long block_offset = *pos & (block_size - 1);
+			unsigned bytes_skipped =
+				(blocks_skipped << block_bits) - block_offset;
+
+			*pos += bytes_skipped;
+			poff += bytes_skipped;
+			plen -= bytes_skipped;
 		}
+		first = next;
 
 		/* truncate len if we find any trailing uptodate block(s) */
-		while (++i <= last) {
-			if (ifs_block_is_uptodate(ifs, i)) {
-				plen -= (last - i + 1) * block_size;
-				last = i - 1;
-				break;
+		if (++next <= last) {
+			next = ifs_next_uptodate_block(folio, next, last);
+			if (next <= last) {
+				plen -= iomap_bytes_to_truncate(*pos + plen,
+						block_bits, last - next + 1);
+				last = next - 1;
 			}
 		}
 	}
···
 		unsigned end = offset_in_folio(folio, isize - 1) >> block_bits;
 
 		if (first <= end && last > end)
-			plen -= (last - end) * block_size;
+			plen -= iomap_bytes_to_truncate(*pos + plen, block_bits,
+					last - end);
 	}
 
 	*offp = poff;
···
 	return 0;
 }
 
-#ifdef CONFIG_BLOCK
-static void iomap_finish_folio_read(struct folio *folio, size_t off,
-		size_t len, int error)
+void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len,
+		int error)
 {
 	struct iomap_folio_state *ifs = folio->private;
 	bool uptodate = !error;
···
 	if (finished)
 		folio_end_read(folio, uptodate);
 }
+EXPORT_SYMBOL_GPL(iomap_finish_folio_read);
 
-static void iomap_read_end_io(struct bio *bio)
+static void iomap_read_init(struct folio *folio)
 {
-	int error = blk_status_to_errno(bio->bi_status);
-	struct folio_iter fi;
+	struct iomap_folio_state *ifs = folio->private;
 
-	bio_for_each_folio_all(fi, bio)
-		iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error);
-	bio_put(bio);
+	if (ifs) {
+		size_t len = folio_size(folio);
+
+		/*
+		 * ifs->read_bytes_pending is used to track how many bytes are
+		 * read in asynchronously by the IO helper. We need to track
+		 * this so that we can know when the IO helper has finished
+		 * reading in all the necessary ranges of the folio and can end
+		 * the read.
+		 *
+		 * Increase ->read_bytes_pending by the folio size to start, and
+		 * add a +1 bias. We'll subtract the bias and any uptodate /
+		 * zeroed ranges that did not require IO in iomap_read_end()
+		 * after we're done processing the folio.
+		 *
+		 * We do this because otherwise, we would have to increment
+		 * ifs->read_bytes_pending every time a range in the folio needs
+		 * to be read in, which can get expensive since the spinlock
+		 * needs to be held whenever modifying ifs->read_bytes_pending.
+		 *
+		 * We add the bias to ensure the read has not been ended on the
+		 * folio when iomap_read_end() is called, even if the IO helper
+		 * has already finished reading in the entire folio.
+		 */
+		spin_lock_irq(&ifs->state_lock);
+		WARN_ON_ONCE(ifs->read_bytes_pending != 0);
+		ifs->read_bytes_pending = len + 1;
+		spin_unlock_irq(&ifs->state_lock);
+	}
 }
 
-struct iomap_readpage_ctx {
-	struct folio *cur_folio;
-	bool cur_folio_in_bio;
-	struct bio *bio;
-	struct readahead_control *rac;
-};
+/*
+ * This ends IO if no bytes were submitted to an IO helper.
+ *
+ * Otherwise, this calibrates ifs->read_bytes_pending to represent only the
+ * submitted bytes (see comment in iomap_read_init()). If all bytes submitted
+ * have already been completed by the IO helper, then this will end the read.
+ * Else the IO helper will end the read after all submitted ranges have been
+ * read.
+ */
+static void iomap_read_end(struct folio *folio, size_t bytes_submitted)
+{
+	struct iomap_folio_state *ifs = folio->private;
 
-static int iomap_readpage_iter(struct iomap_iter *iter,
-		struct iomap_readpage_ctx *ctx)
+	if (ifs) {
+		bool end_read, uptodate;
+
+		spin_lock_irq(&ifs->state_lock);
+		if (!ifs->read_bytes_pending) {
+			WARN_ON_ONCE(bytes_submitted);
+			spin_unlock_irq(&ifs->state_lock);
+			folio_unlock(folio);
+			return;
+		}
+
+		/*
+		 * Subtract any bytes that were initially accounted to
+		 * read_bytes_pending but skipped for IO. The +1 accounts for
+		 * the bias we added in iomap_read_init().
+		 */
+		ifs->read_bytes_pending -=
+			(folio_size(folio) + 1 - bytes_submitted);
+
+		/*
+		 * If !ifs->read_bytes_pending, this means all pending reads by
+		 * the IO helper have already completed, which means we need to
+		 * end the folio read here. If ifs->read_bytes_pending != 0,
+		 * the IO helper will end the folio read.
+		 */
+		end_read = !ifs->read_bytes_pending;
+		if (end_read)
+			uptodate = ifs_is_fully_uptodate(folio, ifs);
+		spin_unlock_irq(&ifs->state_lock);
+		if (end_read)
+			folio_end_read(folio, uptodate);
+	} else if (!bytes_submitted) {
+		/*
+		 * If there were no bytes submitted, this means we are
+		 * responsible for unlocking the folio here, since no IO helper
+		 * has taken ownership of it. If there were bytes submitted,
+		 * then the IO helper will end the read via
+		 * iomap_finish_folio_read().
+		 */
+		folio_unlock(folio);
+	}
+}
+
+static int iomap_read_folio_iter(struct iomap_iter *iter,
+		struct iomap_read_folio_ctx *ctx, size_t *bytes_submitted)
 {
 	const struct iomap *iomap = &iter->iomap;
 	loff_t pos = iter->pos;
 	loff_t length = iomap_length(iter);
 	struct folio *folio = ctx->cur_folio;
-	struct iomap_folio_state *ifs;
 	size_t poff, plen;
-	sector_t sector;
+	loff_t pos_diff;
 	int ret;
 
 	if (iomap->type == IOMAP_INLINE) {
 		ret = iomap_read_inline_data(iter, folio);
 		if (ret)
 			return ret;
-		return iomap_iter_advance(iter, &length);
+		return iomap_iter_advance(iter, length);
 	}
 
-	/* zero post-eof blocks as the page may be mapped */
-	ifs = ifs_alloc(iter->inode, folio, iter->flags);
-	iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
-	if (plen == 0)
-		goto done;
+	ifs_alloc(iter->inode, folio, iter->flags);
 
-	if (iomap_block_needs_zeroing(iter, pos)) {
-		folio_zero_range(folio, poff, plen);
-		iomap_set_range_uptodate(folio, poff, plen);
-		goto done;
-	}
+	length = min_t(loff_t, length,
+			folio_size(folio) - offset_in_folio(folio, pos));
+	while (length) {
+		iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff,
+				&plen);
 
-	ctx->cur_folio_in_bio = true;
-	if (ifs) {
-		spin_lock_irq(&ifs->state_lock);
-		ifs->read_bytes_pending += plen;
-		spin_unlock_irq(&ifs->state_lock);
-	}
+		pos_diff = pos - iter->pos;
+		if (WARN_ON_ONCE(pos_diff + plen > length))
+			return -EIO;
 
-	sector = iomap_sector(iomap, pos);
-	if (!ctx->bio ||
-	    bio_end_sector(ctx->bio) != sector ||
-	    !bio_add_folio(ctx->bio, folio, plen, poff)) {
-		gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
-		gfp_t orig_gfp = gfp;
-		unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
-
-		if (ctx->bio)
-			submit_bio(ctx->bio);
-
-		if (ctx->rac) /* same as readahead_gfp_mask */
-			gfp |= __GFP_NORETRY | __GFP_NOWARN;
-		ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs),
-				REQ_OP_READ, gfp);
-		/*
-		 * If the bio_alloc fails, try it again for a single page to
-		 * avoid having to deal with partial page reads. This emulates
-		 * what do_mpage_read_folio does.
-		 */
-		if (!ctx->bio) {
-			ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ,
-					orig_gfp);
-		}
-		if (ctx->rac)
-			ctx->bio->bi_opf |= REQ_RAHEAD;
-		ctx->bio->bi_iter.bi_sector = sector;
-		ctx->bio->bi_end_io = iomap_read_end_io;
-		bio_add_folio_nofail(ctx->bio, folio, plen, poff);
-	}
-
-done:
-	/*
-	 * Move the caller beyond our range so that it keeps making progress.
-	 * For that, we have to include any leading non-uptodate ranges, but
-	 * we can skip trailing ones as they will be handled in the next
-	 * iteration.
-	 */
-	length = pos - iter->pos + plen;
-	return iomap_iter_advance(iter, &length);
-}
-
-static int iomap_read_folio_iter(struct iomap_iter *iter,
-		struct iomap_readpage_ctx *ctx)
-{
-	int ret;
-
-	while (iomap_length(iter)) {
-		ret = iomap_readpage_iter(iter, ctx);
+		ret = iomap_iter_advance(iter, pos_diff);
 		if (ret)
 			return ret;
-	}
 
+		if (plen == 0)
+			return 0;
+
+		/* zero post-eof blocks as the page may be mapped */
+		if (iomap_block_needs_zeroing(iter, pos)) {
+			folio_zero_range(folio, poff, plen);
+			iomap_set_range_uptodate(folio, poff, plen);
+		} else {
+			if (!*bytes_submitted)
+				iomap_read_init(folio);
+			ret = ctx->ops->read_folio_range(iter, ctx, plen);
+			if (ret)
+				return ret;
+			*bytes_submitted += plen;
+		}
+
+		ret = iomap_iter_advance(iter, plen);
+		if (ret)
+			return ret;
+		length -= pos_diff + plen;
+		pos = iter->pos;
+	}
 	return 0;
 }
 
-int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
+void iomap_read_folio(const struct iomap_ops *ops,
+		struct iomap_read_folio_ctx *ctx)
 {
+	struct folio *folio = ctx->cur_folio;
 	struct iomap_iter iter = {
 		.inode		= folio->mapping->host,
 		.pos		= folio_pos(folio),
 		.len		= folio_size(folio),
 	};
-	struct iomap_readpage_ctx ctx = {
-		.cur_folio	= folio,
-	};
+	size_t bytes_submitted = 0;
 	int ret;
 
 	trace_iomap_readpage(iter.inode, 1);
 
 	while ((ret = iomap_iter(&iter, ops)) > 0)
-		iter.status = iomap_read_folio_iter(&iter, &ctx);
+		iter.status = iomap_read_folio_iter(&iter, ctx,
+				&bytes_submitted);
 
-	if (ctx.bio) {
-		submit_bio(ctx.bio);
-		WARN_ON_ONCE(!ctx.cur_folio_in_bio);
-	} else {
-		WARN_ON_ONCE(ctx.cur_folio_in_bio);
-		folio_unlock(folio);
-	}
+	if (ctx->ops->submit_read)
+		ctx->ops->submit_read(ctx);
 
-	/*
-	 * Just like mpage_readahead and block_read_full_folio, we always
-	 * return 0 and just set the folio error flag on errors. This
-	 * should be cleaned up throughout the stack eventually.
-	 */
-	return 0;
+	iomap_read_end(folio, bytes_submitted);
 }
 EXPORT_SYMBOL_GPL(iomap_read_folio);
 
 static int iomap_readahead_iter(struct iomap_iter *iter,
-		struct iomap_readpage_ctx *ctx)
+		struct iomap_read_folio_ctx *ctx, size_t *cur_bytes_submitted)
 {
 	int ret;
 
 	while (iomap_length(iter)) {
 		if (ctx->cur_folio &&
 		    offset_in_folio(ctx->cur_folio, iter->pos) == 0) {
-			if (!ctx->cur_folio_in_bio)
-				folio_unlock(ctx->cur_folio);
+			iomap_read_end(ctx->cur_folio, *cur_bytes_submitted);
 			ctx->cur_folio = NULL;
 		}
 		if (!ctx->cur_folio) {
 			ctx->cur_folio = readahead_folio(ctx->rac);
-			ctx->cur_folio_in_bio = false;
+			if (WARN_ON_ONCE(!ctx->cur_folio))
+				return -EINVAL;
+			*cur_bytes_submitted = 0;
 		}
-		ret = iomap_readpage_iter(iter, ctx);
+		ret = iomap_read_folio_iter(iter, ctx, cur_bytes_submitted);
 		if (ret)
 			return ret;
 	}
···
 
 /**
  * iomap_readahead - Attempt to read pages from a file.
- * @rac: Describes the pages to be read.
  * @ops: The operations vector for the filesystem.
+ * @ctx: The ctx used for issuing readahead.
  *
  * This function is for filesystems to call to implement their readahead
  * address_space operation.
···
 * function is called with memalloc_nofs set, so allocations will not cause
 * the filesystem to be reentered.
 */
-void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
+void iomap_readahead(const struct iomap_ops *ops,
+		struct iomap_read_folio_ctx *ctx)
 {
+	struct readahead_control *rac = ctx->rac;
 	struct iomap_iter iter = {
 		.inode	= rac->mapping->host,
 		.pos	= readahead_pos(rac),
 		.len	= readahead_length(rac),
 	};
-	struct iomap_readpage_ctx ctx = {
-		.rac	= rac,
-	};
+	size_t cur_bytes_submitted;
 
 	trace_iomap_readahead(rac->mapping->host, readahead_count(rac));
 
 	while (iomap_iter(&iter, ops) > 0)
-		iter.status = iomap_readahead_iter(&iter, &ctx);
+		iter.status = iomap_readahead_iter(&iter, ctx,
+				&cur_bytes_submitted);
 
-	if (ctx.bio)
-		submit_bio(ctx.bio);
-	if (ctx.cur_folio) {
-		if (!ctx.cur_folio_in_bio)
-			folio_unlock(ctx.cur_folio);
-	}
+	if (ctx->ops->submit_read)
+		ctx->ops->submit_read(ctx);
+
+	if (ctx->cur_folio)
+		iomap_read_end(ctx->cur_folio, cur_bytes_submitted);
 }
 EXPORT_SYMBOL_GPL(iomap_readahead);
-
-static int iomap_read_folio_range(const struct iomap_iter *iter,
-		struct folio *folio, loff_t pos, size_t len)
-{
-	const struct iomap *srcmap = iomap_iter_srcmap(iter);
-	struct bio_vec bvec;
-	struct bio bio;
-
-	bio_init(&bio, srcmap->bdev, &bvec, 1, REQ_OP_READ);
-	bio.bi_iter.bi_sector = iomap_sector(srcmap, pos);
-	bio_add_folio_nofail(&bio, folio, len, offset_in_folio(folio, pos));
-	return submit_bio_wait(&bio);
-}
-#else
-static int iomap_read_folio_range(const struct iomap_iter *iter,
-		struct folio *folio, loff_t pos, size_t len)
-{
-	WARN_ON_ONCE(1);
-	return -EIO;
-}
-#endif /* CONFIG_BLOCK */
 
 /*
  * iomap_is_partially_uptodate checks whether blocks within a folio are
···
 {
 	struct iomap_folio_state *ifs = folio->private;
 	struct inode *inode = folio->mapping->host;
-	unsigned first, last, i;
+	unsigned first, last;
 
 	if (!ifs)
 		return false;
···
 	first = from >> inode->i_blkbits;
 	last = (from + count - 1) >> inode->i_blkbits;
 
-	for (i = first; i <= last; i++)
-		if (!ifs_block_is_uptodate(ifs, i))
-			return false;
-	return true;
+	return ifs_next_nonuptodate_block(folio, first, last) > last;
 }
 EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
···
 		if (plen == 0)
 			break;
 
-		if (!(iter->flags & IOMAP_UNSHARE) &&
-		    (from <= poff || from >= poff + plen) &&
-		    (to <= poff || to >= poff + plen))
+		/*
+		 * If the read range will be entirely overwritten by the write,
+		 * we can skip having to zero/read it in.
+		 */
+		if (!(iter->flags & IOMAP_UNSHARE) && from <= poff &&
+		    to >= poff + plen)
 			continue;
 
 		if (iomap_block_needs_zeroing(iter, block_start)) {
···
 				status = write_ops->read_folio_range(iter,
 						folio, block_start, plen);
 			else
-				status = iomap_read_folio_range(iter,
+				status = iomap_bio_read_folio_range_sync(iter,
 						folio, block_start, plen);
 			if (status)
 				return status;
···
 
 	if (!mapping_large_folio_support(iter->inode->i_mapping))
 		len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));
+
+	if (iter->fbatch) {
+		struct folio *folio = folio_batch_next(iter->fbatch);
+
+		if (!folio)
+			return NULL;
+
+		/*
+		 * The folio mapping generally shouldn't have changed based on
+		 * fs locks, but be consistent with filemap lookup and retry
+		 * the iter if it does.
+		 */
+		folio_lock(folio);
+		if (unlikely(folio->mapping != iter->inode->i_mapping)) {
+			iter->iomap.flags |= IOMAP_F_STALE;
+			folio_unlock(folio);
+			return NULL;
+		}
+
+		folio_get(folio);
+		return folio;
+	}
 
 	if (write_ops && write_ops->get_folio)
 		return write_ops->get_folio(iter, pos, len);
···
 		size_t *poffset, u64 *plen)
 {
 	const struct iomap *srcmap = iomap_iter_srcmap(iter);
-	loff_t pos = iter->pos;
+	loff_t pos;
 	u64 len = min_t(u64, SIZE_MAX, iomap_length(iter));
 	struct folio *folio;
 	int status = 0;
 
 	len = min_not_zero(len, *plen);
-	BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
-	if (srcmap != &iter->iomap)
-		BUG_ON(pos + len > srcmap->offset + srcmap->length);
+	*foliop = NULL;
+	*plen = 0;
 
 	if (fatal_signal_pending(current))
 		return -EINTR;
···
 	folio = __iomap_get_folio(iter, write_ops, len);
 	if (IS_ERR(folio))
 		return PTR_ERR(folio);
+
+	/*
+	 * No folio means we're done with a batch. We still have range to
+	 * process so return and let the caller iterate and refill the batch.
+	 */
+	if (!folio) {
+		WARN_ON_ONCE(!iter->fbatch);
+		return 0;
+	}
 
 	/*
 	 * Now we have a locked folio, before we do anything with it we need to
···
 			status = 0;
 			goto out_unlock;
 		}
+	}
+
+	/*
+	 * The folios in a batch may not be contiguous. If we've skipped
+	 * forward, advance the iter to the pos of the current folio. If the
+	 * folio starts beyond the end of the mapping, it may have been trimmed
+	 * since the lookup for whatever reason. Return a NULL folio to
+	 * terminate the op.
+	 */
+	if (folio_pos(folio) > iter->pos) {
+		len = min_t(u64, folio_pos(folio) - iter->pos,
+				iomap_length(iter));
+		status = iomap_iter_advance(iter, len);
+		len = iomap_length(iter);
+		if (status || !len)
+			goto out_unlock;
 	}
 
 	pos = iomap_trim_folio_range(iter, folio, poffset, &len);
···
 		}
 	} else {
 		total_written += written;
-		iomap_iter_advance(iter, &written);
+		iomap_iter_advance(iter, written);
 	}
 } while (iov_iter_count(i) && iomap_length(iter));
···
 		struct folio *folio, loff_t start_byte, loff_t end_byte,
 		struct iomap *iomap, iomap_punch_t punch)
 {
-	unsigned int first_blk, last_blk, i;
+	unsigned int first_blk, last_blk;
 	loff_t last_byte;
 	u8 blkbits = inode->i_blkbits;
 	struct iomap_folio_state *ifs;
···
 			folio_pos(folio) + folio_size(folio) - 1);
 	first_blk = offset_in_folio(folio, start_byte) >> blkbits;
 	last_blk = offset_in_folio(folio, last_byte) >> blkbits;
-	for (i = first_blk; i <= last_blk; i++) {
-		if (!ifs_block_is_dirty(folio, ifs, i))
-			punch(inode, folio_pos(folio) + (i << blkbits),
-					1 << blkbits, iomap);
+	while ((first_blk = ifs_next_clean_block(folio, first_blk, last_blk))
+			<= last_blk) {
+		punch(inode, folio_pos(folio) + (first_blk << blkbits),
+				1 << blkbits, iomap);
+		first_blk++;
 	}
 }
···
 	int status;
 
 	if (!iomap_want_unshare_iter(iter))
-		return iomap_iter_advance(iter, &bytes);
+		return iomap_iter_advance(iter, bytes);
 
 	do {
 		struct folio *folio;
···
 
 		balance_dirty_pages_ratelimited(iter->inode->i_mapping);
 
-		status = iomap_iter_advance(iter, &bytes);
+		status = iomap_iter_advance(iter, bytes);
 		if (status)
 			break;
-	} while (bytes > 0);
+	} while ((bytes = iomap_length(iter)) > 0);
 
 	return status;
 }
···
 		if (iter->iomap.flags & IOMAP_F_STALE)
 			break;
 
+		/* a NULL folio means we're done with a folio batch */
+		if (!folio) {
+			status = iomap_iter_advance_full(iter);
+			break;
+		}
+
 		/* warn about zeroing folios beyond eof that won't write back */
 		WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size);
···
 		if (WARN_ON_ONCE(!ret))
 			return -EIO;
 
-		status = iomap_iter_advance(iter, &bytes);
+		status = iomap_iter_advance(iter, bytes);
 		if (status)
 			break;
-	} while (bytes > 0);
+	} while ((bytes = iomap_length(iter)) > 0);
 
 	if (did_zero)
 		*did_zero = true;
 	return status;
 }
+
+loff_t
+iomap_fill_dirty_folios(
+	struct iomap_iter	*iter,
+	loff_t			offset,
+	loff_t			length)
+{
+	struct address_space	*mapping = iter->inode->i_mapping;
+	pgoff_t			start = offset >> PAGE_SHIFT;
+	pgoff_t			end = (offset + length - 1) >> PAGE_SHIFT;
+
+	iter->fbatch = kmalloc(sizeof(struct folio_batch), GFP_KERNEL);
+	if (!iter->fbatch)
+		return offset + length;
+	folio_batch_init(iter->fbatch);
+
+	filemap_get_folios_dirty(mapping, &start, end, iter->fbatch);
+	return (start << PAGE_SHIFT);
+}
+EXPORT_SYMBOL_GPL(iomap_fill_dirty_folios);
 
 int
 iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
···
 		.private	= private,
 	};
 	struct address_space *mapping = inode->i_mapping;
-	unsigned int blocksize = i_blocksize(inode);
-	unsigned int off = pos & (blocksize - 1);
-	loff_t plen = min_t(loff_t, len, blocksize - off);
 	int ret;
 	bool range_dirty;
-
-	/*
-	 * Zero range can skip mappings that are zero on disk so long as
-	 * pagecache is clean. If pagecache was dirty prior to zero range, the
-	 * mapping converts on writeback completion and so must be zeroed.
-	 *
-	 * The simplest way to deal with this across a range is to flush
-	 * pagecache and process the updated mappings. To avoid excessive
-	 * flushing on partial eof zeroing, special case it to zero the
-	 * unaligned start portion if already dirty in pagecache.
-	 */
-	if (off &&
-	    filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) {
-		iter.len = plen;
-		while ((ret = iomap_iter(&iter, ops)) > 0)
-			iter.status = iomap_zero_iter(&iter, did_zero,
-					write_ops);
-
-		iter.len = len - (iter.pos - pos);
-		if (ret || !iter.len)
-			return ret;
-	}
 
 	/*
 	 * To avoid an unconditional flush, check pagecache state and only flush
 	 * if dirty and the fs returns a mapping that might convert on
 	 * writeback.
 	 */
-	range_dirty = filemap_range_needs_writeback(inode->i_mapping,
-			iter.pos, iter.pos + iter.len - 1);
+	range_dirty = filemap_range_needs_writeback(mapping, iter.pos,
+			iter.pos + iter.len - 1);
 	while ((ret = iomap_iter(&iter, ops)) > 0) {
 		const struct iomap *srcmap = iomap_iter_srcmap(&iter);
 
-		if (srcmap->type == IOMAP_HOLE ||
-		    srcmap->type == IOMAP_UNWRITTEN) {
+		if (WARN_ON_ONCE(iter.fbatch &&
+				 srcmap->type != IOMAP_UNWRITTEN))
+			return -EIO;
+
+		if (!iter.fbatch &&
+		    (srcmap->type == IOMAP_HOLE ||
+		     srcmap->type == IOMAP_UNWRITTEN)) {
 			s64 status;
 
 			if (range_dirty) {
···
 		folio_mark_dirty(folio);
 	}
 
-	return iomap_iter_advance(iter, &length);
+	return iomap_iter_advance(iter, length);
 }
 
 vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
···
 }
 EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
 
-void iomap_start_folio_write(struct inode *inode, struct folio *folio,
-		size_t len)
+static void iomap_writeback_init(struct inode *inode, struct folio *folio)
 {
 	struct iomap_folio_state *ifs = folio->private;
 
 	WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
-	if (ifs)
-		atomic_add(len, &ifs->write_bytes_pending);
+	if (ifs) {
+		WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
+		/*
+		 * Set this to the folio size. After processing the folio for
+		 * writeback in iomap_writeback_folio(), we'll subtract any
+		 * ranges not written back.
+		 *
+		 * We do this because otherwise, we would have to atomically
+		 * increment ifs->write_bytes_pending every time a range in the
+		 * folio needs to be written back.
+		 */
+		atomic_set(&ifs->write_bytes_pending, folio_size(folio));
+	}
 }
-EXPORT_SYMBOL_GPL(iomap_start_folio_write);
 
 void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
 		size_t len)
···
 
 static int iomap_writeback_range(struct iomap_writepage_ctx *wpc,
 		struct folio *folio, u64 pos, u32 rlen, u64 end_pos,
-		bool *wb_pending)
+		size_t *bytes_submitted)
 {
 	do {
 		ssize_t ret;
···
 		pos += ret;
 
 		/*
-		 * Holes are not be written back by ->writeback_range, so track
+		 * Holes are not written back by ->writeback_range, so track
 		 * if we did handle anything that is not a hole here.
 		 */
 		if (wpc->iomap.type != IOMAP_HOLE)
-			*wb_pending = true;
+			*bytes_submitted += ret;
 	} while (rlen);
 
 	return 0;
···
 	u64 pos = folio_pos(folio);
 	u64 end_pos = pos + folio_size(folio);
 	u64 end_aligned = 0;
-	bool wb_pending = false;
+	size_t bytes_submitted = 0;
 	int error = 0;
 	u32 rlen;
···
 		iomap_set_range_dirty(folio, 0, end_pos - pos);
 	}
 
-	/*
-	 * Keep the I/O completion handler from clearing the writeback
-	 * bit until we have submitted all blocks by adding a bias to
-	 * ifs->write_bytes_pending, which is dropped after submitting
-	 * all blocks.
-	 */
-	WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
-	iomap_start_folio_write(inode, folio, 1);
+	iomap_writeback_init(inode, folio);
 }
 
 /*
···
 	end_aligned = round_up(end_pos, i_blocksize(inode));
 	while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) {
 		error = iomap_writeback_range(wpc, folio, pos, rlen, end_pos,
-				&wb_pending);
+				&bytes_submitted);
 		if (error)
 			break;
 		pos += rlen;
 	}
 
-	if (wb_pending)
+	if (bytes_submitted)
 		wpc->nr_folios++;
 
 	/*
···
 	 * bit ourselves right after unlocking the page.
 	 */
 	if (ifs) {
-		if (atomic_dec_and_test(&ifs->write_bytes_pending))
-			folio_end_writeback(folio);
-	} else {
-		if (!wb_pending)
-			folio_end_writeback(folio);
+		/*
+		 * Subtract any bytes that were initially accounted to
+		 * write_bytes_pending but skipped for writeback.
+		 */
+		size_t bytes_not_submitted = folio_size(folio) -
+			bytes_submitted;
+
+		if (bytes_not_submitted)
+			iomap_finish_folio_write(inode, folio,
+					bytes_not_submitted);
+	} else if (!bytes_submitted) {
+		folio_end_writeback(folio);
 	}
+
 	mapping_set_error(inode->i_mapping, error);
 	return error;
 }
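The read path above replaces per-range spinlocked increments of ifs->read_bytes_pending with a single biased counter: iomap_read_init() charges the whole folio plus a +1 bias, and iomap_read_end() later subtracts the bias and every byte that was satisfied without IO. A condensed userspace model of that protocol (hypothetical names, no locking, not the kernel API) looks roughly like this:

```c
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

/* Stand-in for the relevant bits of iomap_folio_state. */
struct folio_read_state {
	size_t read_bytes_pending;
	bool read_ended;
};

/* Mirrors iomap_read_init(): charge the whole folio, plus a +1 bias. */
static void read_init(struct folio_read_state *st, size_t folio_size)
{
	st->read_bytes_pending = folio_size + 1;
	st->read_ended = false;
}

/* Mirrors iomap_finish_folio_read(): the IO helper completes one range. */
static void finish_range(struct folio_read_state *st, size_t len)
{
	st->read_bytes_pending -= len;
	if (st->read_bytes_pending == 0)
		st->read_ended = true;
}

/*
 * Mirrors iomap_read_end(): drop the bias and all bytes that never needed
 * IO, so the counter now represents only outstanding submitted bytes.
 */
static void read_end(struct folio_read_state *st, size_t folio_size,
		size_t bytes_submitted)
{
	st->read_bytes_pending -= folio_size + 1 - bytes_submitted;
	if (st->read_bytes_pending == 0)
		st->read_ended = true;
}
```

The +1 bias is what keeps the read from being ended by the IO helper alone: even if every submitted byte completes before read_end() runs, the counter stays at 1 until the submitter's final subtraction.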
+127 -105
fs/iomap/direct-io.c
··· 16 16 * Private flags for iomap_dio, must not overlap with the public ones in 17 17 * iomap.h: 18 18 */ 19 - #define IOMAP_DIO_NO_INVALIDATE (1U << 25) 20 - #define IOMAP_DIO_CALLER_COMP (1U << 26) 21 - #define IOMAP_DIO_INLINE_COMP (1U << 27) 19 + #define IOMAP_DIO_NO_INVALIDATE (1U << 26) 20 + #define IOMAP_DIO_COMP_WORK (1U << 27) 22 21 #define IOMAP_DIO_WRITE_THROUGH (1U << 28) 23 22 #define IOMAP_DIO_NEED_SYNC (1U << 29) 24 23 #define IOMAP_DIO_WRITE (1U << 30) ··· 139 140 } 140 141 EXPORT_SYMBOL_GPL(iomap_dio_complete); 141 142 142 - static ssize_t iomap_dio_deferred_complete(void *data) 143 - { 144 - return iomap_dio_complete(data); 145 - } 146 - 147 143 static void iomap_dio_complete_work(struct work_struct *work) 148 144 { 149 145 struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); ··· 173 179 174 180 WRITE_ONCE(dio->submit.waiter, NULL); 175 181 blk_wake_io_task(waiter); 176 - } else if (dio->flags & IOMAP_DIO_INLINE_COMP) { 177 - WRITE_ONCE(iocb->private, NULL); 178 - iomap_dio_complete_work(&dio->aio.work); 179 - } else if (dio->flags & IOMAP_DIO_CALLER_COMP) { 180 - /* 181 - * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then 182 - * schedule our completion that way to avoid an async punt to a 183 - * workqueue. 184 - */ 185 - /* only polled IO cares about private cleared */ 186 - iocb->private = dio; 187 - iocb->dio_complete = iomap_dio_deferred_complete; 182 + return; 183 + } 188 184 189 - /* 190 - * Invoke ->ki_complete() directly. We've assigned our 191 - * dio_complete callback handler, and since the issuer set 192 - * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will 193 - * notice ->dio_complete being set and will defer calling that 194 - * handler until it can be done from a safe task context. 195 - * 196 - * Note that the 'res' being passed in here is not important 197 - * for this case. 
The actual completion value of the request 198 - * will be gotten from dio_complete when that is run by the 199 - * issuer. 200 - */ 201 - iocb->ki_complete(iocb, 0); 202 - } else { 185 + /* 186 + * Always run error completions in user context. These are not 187 + * performance critical and some code relies on taking sleeping locks 188 + * for error handling. 189 + */ 190 + if (dio->error) 191 + dio->flags |= IOMAP_DIO_COMP_WORK; 192 + 193 + /* 194 + * Never invalidate pages from this context to avoid deadlocks with 195 + * buffered I/O completions when called from the ioend workqueue, 196 + * or avoid sleeping when called directly from ->bi_end_io. 197 + * Tough luck if you hit the tiny race with someone dirtying the range 198 + * right between this check and the actual completion. 199 + */ 200 + if ((dio->flags & IOMAP_DIO_WRITE) && 201 + !(dio->flags & IOMAP_DIO_COMP_WORK)) { 202 + if (dio->iocb->ki_filp->f_mapping->nrpages) 203 + dio->flags |= IOMAP_DIO_COMP_WORK; 204 + else 205 + dio->flags |= IOMAP_DIO_NO_INVALIDATE; 206 + } 207 + 208 + if (dio->flags & IOMAP_DIO_COMP_WORK) { 203 209 struct inode *inode = file_inode(iocb->ki_filp); 204 210 205 211 /* ··· 210 216 */ 211 217 INIT_WORK(&dio->aio.work, iomap_dio_complete_work); 212 218 queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); 219 + return; 213 220 } 221 + 222 + WRITE_ONCE(iocb->private, NULL); 223 + iomap_dio_complete_work(&dio->aio.work); 214 224 } 215 225 216 226 void iomap_dio_bio_end_io(struct bio *bio) ··· 250 252 /* 251 253 * Try to avoid another context switch for the completion given 252 254 * that we are already called from the ioend completion 253 - * workqueue, but never invalidate pages from this thread to 254 - * avoid deadlocks with buffered I/O completions. Tough luck if 255 - * you hit the tiny race with someone dirtying the range now 256 - * between this check and the actual completion. 255 + * workqueue. 
257 256 */ 258 - if (!dio->iocb->ki_filp->f_mapping->nrpages) { 259 - dio->flags |= IOMAP_DIO_INLINE_COMP; 260 - dio->flags |= IOMAP_DIO_NO_INVALIDATE; 261 - } 262 - dio->flags &= ~IOMAP_DIO_CALLER_COMP; 257 + dio->flags &= ~IOMAP_DIO_COMP_WORK; 263 258 iomap_dio_done(dio); 264 259 } 265 260 ··· 297 306 return 0; 298 307 } 299 308 300 - /* 301 - * Use a FUA write if we need datasync semantics and this is a pure data I/O 302 - * that doesn't require any metadata updates (including after I/O completion 303 - * such as unwritten extent conversion) and the underlying device either 304 - * doesn't have a volatile write cache or supports FUA. 305 - * This allows us to avoid cache flushes on I/O completion. 306 - */ 307 - static inline bool iomap_dio_can_use_fua(const struct iomap *iomap, 308 - struct iomap_dio *dio) 309 - { 310 - if (iomap->flags & (IOMAP_F_SHARED | IOMAP_F_DIRTY)) 311 - return false; 312 - if (!(dio->flags & IOMAP_DIO_WRITE_THROUGH)) 313 - return false; 314 - return !bdev_write_cache(iomap->bdev) || bdev_fua(iomap->bdev); 315 - } 316 - 317 309 static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) 318 310 { 319 311 const struct iomap *iomap = &iter->iomap; ··· 310 336 int nr_pages, ret = 0; 311 337 u64 copied = 0; 312 338 size_t orig_count; 339 + unsigned int alignment; 313 340 314 - if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1)) 341 + /* 342 + * File systems that write out of place and always allocate new blocks 343 + * need each bio to be block aligned as that's the unit of allocation. 
344 + */ 345 + if (dio->flags & IOMAP_DIO_FSBLOCK_ALIGNED) 346 + alignment = fs_block_size; 347 + else 348 + alignment = bdev_logical_block_size(iomap->bdev); 349 + 350 + if ((pos | length) & (alignment - 1)) 315 351 return -EINVAL; 316 352 317 353 if (dio->flags & IOMAP_DIO_WRITE) { 318 - bio_opf |= REQ_OP_WRITE; 354 + bool need_completion_work = true; 355 + 356 + switch (iomap->type) { 357 + case IOMAP_MAPPED: 358 + /* 359 + * Directly mapped I/O does not inherently need to do 360 + * work at I/O completion time. But there are various 361 + * cases below where this will get set again. 362 + */ 363 + need_completion_work = false; 364 + break; 365 + case IOMAP_UNWRITTEN: 366 + dio->flags |= IOMAP_DIO_UNWRITTEN; 367 + need_zeroout = true; 368 + break; 369 + default: 370 + break; 371 + } 319 372 320 373 if (iomap->flags & IOMAP_F_ATOMIC_BIO) { 321 374 /* ··· 355 354 bio_opf |= REQ_ATOMIC; 356 355 } 357 356 358 - if (iomap->type == IOMAP_UNWRITTEN) { 359 - dio->flags |= IOMAP_DIO_UNWRITTEN; 357 + if (iomap->flags & IOMAP_F_SHARED) { 358 + /* 359 + * Unsharing needs to update metadata at I/O 360 + * completion time. 361 + */ 362 + need_completion_work = true; 363 + dio->flags |= IOMAP_DIO_COW; 364 + } 365 + 366 + if (iomap->flags & IOMAP_F_NEW) { 367 + /* 368 + * Newly allocated blocks might need recording in 369 + * metadata at I/O completion time. 370 + */ 371 + need_completion_work = true; 360 372 need_zeroout = true; 361 373 } 362 374 363 - if (iomap->flags & IOMAP_F_SHARED) 364 - dio->flags |= IOMAP_DIO_COW; 365 - 366 - if (iomap->flags & IOMAP_F_NEW) 367 - need_zeroout = true; 368 - else if (iomap->type == IOMAP_MAPPED && 369 - iomap_dio_can_use_fua(iomap, dio)) 370 - bio_opf |= REQ_FUA; 371 - 372 - if (!(bio_opf & REQ_FUA)) 373 - dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; 375 + /* 376 + * Use a FUA write if we need datasync semantics and this is a 377 + * pure overwrite that doesn't require any metadata updates. 
378 + * 379 + * This allows us to avoid cache flushes on I/O completion. 380 + */ 381 + if (dio->flags & IOMAP_DIO_WRITE_THROUGH) { 382 + if (!need_completion_work && 383 + !(iomap->flags & IOMAP_F_DIRTY) && 384 + (!bdev_write_cache(iomap->bdev) || 385 + bdev_fua(iomap->bdev))) 386 + bio_opf |= REQ_FUA; 387 + else 388 + dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; 389 + } 374 390 375 391 /* 376 - * We can only do deferred completion for pure overwrites that 392 + * We can only do inline completion for pure overwrites that 377 393 * don't require additional I/O at completion time. 378 394 * 379 - * This rules out writes that need zeroing or extent conversion, 380 - * extend the file size, or issue metadata I/O or cache flushes 381 - * during completion processing. 395 + * This rules out writes that need zeroing or metadata updates to 396 + * convert unwritten or shared extents. 397 + * 398 + * Writes that extend i_size are also not supported, but this is 399 + * handled in __iomap_dio_rw(). 382 400 */ 383 - if (need_zeroout || (pos >= i_size_read(inode)) || 384 - ((dio->flags & IOMAP_DIO_NEED_SYNC) && 385 - !(bio_opf & REQ_FUA))) 386 - dio->flags &= ~IOMAP_DIO_CALLER_COMP; 401 + if (need_completion_work) 402 + dio->flags |= IOMAP_DIO_COMP_WORK; 403 + 404 + bio_opf |= REQ_OP_WRITE; 387 405 } else { 388 406 bio_opf |= REQ_OP_READ; 389 407 } ··· 423 403 * ones we set for inline and deferred completions. If none of those 424 404 * are available for this IO, clear the polled flag. 425 405 */ 426 - if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP))) 406 + if (dio->flags & IOMAP_DIO_COMP_WORK) 427 407 dio->iocb->ki_flags &= ~IOCB_HIPRI; 428 408 429 409 if (need_zeroout) { ··· 454 434 bio->bi_end_io = iomap_dio_bio_end_io; 455 435 456 436 ret = bio_iov_iter_get_pages(bio, dio->submit.iter, 457 - bdev_logical_block_size(iomap->bdev) - 1); 437 + alignment - 1); 458 438 if (unlikely(ret)) { 459 439 /* 460 440 * We have to stop part way through an IO. 
We must fall ··· 516 496 /* Undo iter limitation to current extent */ 517 497 iov_iter_reexpand(dio->submit.iter, orig_count - copied); 518 498 if (copied) 519 - return iomap_iter_advance(iter, &copied); 499 + return iomap_iter_advance(iter, copied); 520 500 return ret; 521 501 } 522 502 ··· 527 507 dio->size += length; 528 508 if (!length) 529 509 return -EFAULT; 530 - return iomap_iter_advance(iter, &length); 510 + return iomap_iter_advance(iter, length); 531 511 } 532 512 533 513 static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio) ··· 562 542 dio->size += copied; 563 543 if (!copied) 564 544 return -EFAULT; 565 - return iomap_iter_advance(iomi, &copied); 545 + return iomap_iter_advance(iomi, copied); 566 546 } 567 547 568 548 static int iomap_dio_iter(struct iomap_iter *iter, struct iomap_dio *dio) ··· 659 639 if (iocb->ki_flags & IOCB_NOWAIT) 660 640 iomi.flags |= IOMAP_NOWAIT; 661 641 662 - if (iov_iter_rw(iter) == READ) { 663 - /* reads can always complete inline */ 664 - dio->flags |= IOMAP_DIO_INLINE_COMP; 642 + if (dio_flags & IOMAP_DIO_FSBLOCK_ALIGNED) 643 + dio->flags |= IOMAP_DIO_FSBLOCK_ALIGNED; 665 644 645 + if (iov_iter_rw(iter) == READ) { 666 646 if (iomi.pos >= dio->i_size) 667 647 goto out_free_dio; 668 648 ··· 675 655 } else { 676 656 iomi.flags |= IOMAP_WRITE; 677 657 dio->flags |= IOMAP_DIO_WRITE; 678 - 679 - /* 680 - * Flag as supporting deferred completions, if the issuer 681 - * groks it. This can avoid a workqueue punt for writes. 682 - * We may later clear this flag if we need to do other IO 683 - * as part of this IO completion. 684 - */ 685 - if (iocb->ki_flags & IOCB_DIO_CALLER_COMP) 686 - dio->flags |= IOMAP_DIO_CALLER_COMP; 687 658 688 659 if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { 689 660 ret = -EAGAIN; ··· 705 694 } 706 695 707 696 /* 697 + * i_size updates must happen from process context. 
698 + */ 699 + if (iomi.pos + iomi.len > dio->i_size) 700 + dio->flags |= IOMAP_DIO_COMP_WORK; 701 + 702 + /* 708 703 * Try to invalidate cache pages for the range we are writing. 709 704 * If this invalidation fails, let the caller fall back to 710 705 * buffered I/O. ··· 734 717 } 735 718 goto out_free_dio; 736 719 } 720 + } 737 721 738 - if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) { 739 - ret = sb_init_dio_done_wq(inode->i_sb); 740 - if (ret < 0) 741 - goto out_free_dio; 742 - } 722 + if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) { 723 + ret = sb_init_dio_done_wq(inode->i_sb); 724 + if (ret < 0) 725 + goto out_free_dio; 743 726 } 744 727 745 728 inode_dio_begin(inode); ··· 782 765 * If all the writes we issued were already written through to the 783 766 * media, we don't need to flush the cache on IO completion. Clear the 784 767 * sync flag for this case. 768 + * 769 + * Otherwise clear the inline completion flag if any sync work is 770 + * needed, as that needs to be performed from process context. 785 771 */ 786 772 if (dio->flags & IOMAP_DIO_WRITE_THROUGH) 787 773 dio->flags &= ~IOMAP_DIO_NEED_SYNC; 774 + else if (dio->flags & IOMAP_DIO_NEED_SYNC) 775 + dio->flags |= IOMAP_DIO_COMP_WORK; 788 776 789 777 /* 790 778 * We are about to drop our additional submission reference, which
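The dio.c changes above route each completion either inline (possibly from interrupt context) or through the workqueue based on a handful of conditions: errors always get process context, anything flagged IOMAP_DIO_COMP_WORK at submission is deferred, and writes whose mapping still has cached pages must defer because page-cache invalidation can sleep. A minimal userspace sketch of that decision, where the struct and its fields are illustrative stand-ins for the dio->flags bits rather than kernel APIs:

```c
#include <assert.h>
#include <stdbool.h>

/* Stand-in for the relevant state of a struct iomap_dio at completion. */
struct dio_state {
	bool is_write;           /* IOMAP_DIO_WRITE */
	bool error;              /* dio->error != 0 */
	bool needs_work;         /* IOMAP_DIO_COMP_WORK set at submission */
	bool mapping_has_pages;  /* f_mapping->nrpages != 0 */
};

/* Decide whether completion must be punted to the dio workqueue. */
static bool completion_needs_workqueue(const struct dio_state *dio)
{
	/* Error completions always run in user context. */
	if (dio->error)
		return true;
	/* Work flagged at submission (unwritten conversion, i_size, ...). */
	if (dio->needs_work)
		return true;
	/*
	 * Writes must invalidate cached pages on completion, which can
	 * sleep, so they cannot complete inline while pages are cached.
	 */
	if (dio->is_write && dio->mapping_has_pages)
		return true;
	return false;
}
```

Pure overwrites with an empty mapping are exactly the case that now completes inline, restoring the pre-iomap behavior described in the merge message.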
+12
fs/iomap/internal.h
··· 6 6 7 7 u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend); 8 8 9 + #ifdef CONFIG_BLOCK 10 + int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter, 11 + struct folio *folio, loff_t pos, size_t len); 12 + #else 13 + static inline int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter, 14 + struct folio *folio, loff_t pos, size_t len) 15 + { 16 + WARN_ON_ONCE(1); 17 + return -EIO; 18 + } 19 + #endif /* CONFIG_BLOCK */ 20 + 9 21 #endif /* _IOMAP_INTERNAL_H */
-2
fs/iomap/ioend.c
··· 194 194 if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff)) 195 195 goto new_ioend; 196 196 197 - iomap_start_folio_write(wpc->inode, folio, map_len); 198 - 199 197 /* 200 198 * Clamp io_offset and io_size to the incore EOF so that ondisk 201 199 * file size updates in the ioend completion are byte-accurate.
+11 -9
fs/iomap/iter.c
··· 8 8 9 9 static inline void iomap_iter_reset_iomap(struct iomap_iter *iter) 10 10 { 11 + if (iter->fbatch) { 12 + folio_batch_release(iter->fbatch); 13 + kfree(iter->fbatch); 14 + iter->fbatch = NULL; 15 + } 16 + 11 17 iter->status = 0; 12 18 memset(&iter->iomap, 0, sizeof(iter->iomap)); 13 19 memset(&iter->srcmap, 0, sizeof(iter->srcmap)); 14 20 } 15 21 16 - /* 17 - * Advance the current iterator position and output the length remaining for the 18 - * current mapping. 19 - */ 20 - int iomap_iter_advance(struct iomap_iter *iter, u64 *count) 22 + /* Advance the current iterator position and decrement the remaining length */ 23 + int iomap_iter_advance(struct iomap_iter *iter, u64 count) 21 24 { 22 - if (WARN_ON_ONCE(*count > iomap_length(iter))) 25 + if (WARN_ON_ONCE(count > iomap_length(iter))) 23 26 return -EIO; 24 - iter->pos += *count; 25 - iter->len -= *count; 26 - *count = iomap_length(iter); 27 + iter->pos += count; 28 + iter->len -= count; 27 29 return 0; 28 30 } 29 31
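The iter.c hunk above changes iomap_iter_advance() from taking an in/out `u64 *count` (which also reported the remaining length) to a plain by-value count that is simply consumed. A stripped-down userspace model of the new convention, with a stand-in struct instead of struct iomap_iter:

```c
#include <assert.h>
#include <stdint.h>

/* Minimal stand-in for the pos/len pair of struct iomap_iter. */
struct iter_model {
	uint64_t pos;
	uint64_t len;
};

/* Advance the current position and decrement the remaining length. */
static int iter_advance(struct iter_model *iter, uint64_t count)
{
	if (count > iter->len)	/* would trip WARN_ON_ONCE in the kernel */
		return -5;	/* -EIO */
	iter->pos += count;
	iter->len -= count;
	return 0;
}
```

Callers that previously read the remaining length back through the pointer now call iomap_length() separately, which is why the seek.c and dio.c hunks shrink to plain `iomap_iter_advance(iter, length)` calls.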
+4 -4
fs/iomap/seek.c
··· 16 16 *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, 17 17 iter->pos, iter->pos + length, SEEK_HOLE); 18 18 if (*hole_pos == iter->pos + length) 19 - return iomap_iter_advance(iter, &length); 19 + return iomap_iter_advance(iter, length); 20 20 return 0; 21 21 case IOMAP_HOLE: 22 22 *hole_pos = iter->pos; 23 23 return 0; 24 24 default: 25 - return iomap_iter_advance(iter, &length); 25 + return iomap_iter_advance(iter, length); 26 26 } 27 27 } 28 28 ··· 59 59 60 60 switch (iter->iomap.type) { 61 61 case IOMAP_HOLE: 62 - return iomap_iter_advance(iter, &length); 62 + return iomap_iter_advance(iter, length); 63 63 case IOMAP_UNWRITTEN: 64 64 *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, 65 65 iter->pos, iter->pos + length, SEEK_DATA); 66 66 if (*hole_pos < 0) 67 - return iomap_iter_advance(iter, &length); 67 + return iomap_iter_advance(iter, length); 68 68 return 0; 69 69 default: 70 70 *hole_pos = iter->pos;
+4 -3
fs/iomap/trace.h
··· 122 122 123 123 124 124 #define IOMAP_DIO_STRINGS \ 125 - {IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \ 126 - {IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \ 127 - {IOMAP_DIO_PARTIAL, "DIO_PARTIAL" } 125 + {IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \ 126 + {IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \ 127 + {IOMAP_DIO_PARTIAL, "DIO_PARTIAL" }, \ 128 + {IOMAP_DIO_FSBLOCK_ALIGNED, "DIO_FSBLOCK_ALIGNED" } 128 129 129 130 DECLARE_EVENT_CLASS(iomap_class, 130 131 TP_PROTO(struct inode *inode, struct iomap *iomap),
+4 -2
fs/xfs/libxfs/xfs_errortag.h
··· 73 73 #define XFS_ERRTAG_WRITE_DELAY_MS 43 74 74 #define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44 75 75 #define XFS_ERRTAG_METAFILE_RESV_CRITICAL 45 76 - #define XFS_ERRTAG_MAX 46 76 + #define XFS_ERRTAG_FORCE_ZERO_RANGE 46 77 + #define XFS_ERRTAG_MAX 47 77 78 78 79 /* 79 80 * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. ··· 134 133 XFS_ERRTAG(WB_DELAY_MS, wb_delay_ms, 3000) \ 135 134 XFS_ERRTAG(WRITE_DELAY_MS, write_delay_ms, 3000) \ 136 135 XFS_ERRTAG(EXCHMAPS_FINISH_ONE, exchmaps_finish_one, 1) \ 137 - XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4) 136 + XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4) \ 137 + XFS_ERRTAG(FORCE_ZERO_RANGE, force_zero_range, 4) 138 138 #endif /* XFS_ERRTAG */ 139 139 140 140 #endif /* __XFS_ERRORTAG_H_ */
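The errortag header above notes that a random factor of 1 means the tag always fires and 2 means roughly half the time, so the new FORCE_ZERO_RANGE default of 4 forces zeroing on about one in four evaluations on DEBUG kernels. A hedged userspace sketch of that convention, using rand() as a stand-in for the kernel's random helper:

```c
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>

/*
 * A factor of N fires the tag roughly once every N evaluations;
 * factor 1 always fires, factor 0 models a disabled tag.
 */
static bool errortag_fires(unsigned int factor)
{
	if (factor == 0)
		return false;		/* tag disabled */
	return (rand() % factor) == 0;	/* 1-in-factor chance */
}
```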
+3 -2
fs/xfs/xfs_aops.c
··· 742 742 struct file *unused, 743 743 struct folio *folio) 744 744 { 745 - return iomap_read_folio(folio, &xfs_read_iomap_ops); 745 + iomap_bio_read_folio(folio, &xfs_read_iomap_ops); 746 + return 0; 746 747 } 747 748 748 749 STATIC void 749 750 xfs_vm_readahead( 750 751 struct readahead_control *rac) 751 752 { 752 - iomap_readahead(rac, &xfs_read_iomap_ops); 753 + iomap_bio_readahead(rac, &xfs_read_iomap_ops); 753 754 } 754 755 755 756 static int
+33 -17
fs/xfs/xfs_file.c
··· 27 27 #include "xfs_file.h" 28 28 #include "xfs_aops.h" 29 29 #include "xfs_zone_alloc.h" 30 + #include "xfs_error.h" 31 + #include "xfs_errortag.h" 30 32 31 33 #include <linux/dax.h> 32 34 #include <linux/falloc.h> ··· 676 674 struct xfs_zone_alloc_ctx *ac) 677 675 { 678 676 unsigned int iolock = XFS_IOLOCK_SHARED; 677 + unsigned int dio_flags = 0; 679 678 ssize_t ret; 679 + 680 + /* 681 + * For always COW inodes, each bio must be aligned to the file system 682 + * block size and not just the device sector size because we need to 683 + * allocate a block-aligned amount of space for each write. 684 + */ 685 + if (xfs_is_always_cow_inode(ip)) 686 + dio_flags |= IOMAP_DIO_FSBLOCK_ALIGNED; 680 687 681 688 ret = xfs_ilock_iocb_for_write(iocb, &iolock); 682 689 if (ret) ··· 704 693 iolock = XFS_IOLOCK_SHARED; 705 694 } 706 695 trace_xfs_file_direct_write(iocb, from); 707 - ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0); 696 + ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0); 708 697 out_unlock: 709 698 xfs_iunlock(ip, iolock); 710 699 return ret; ··· 901 890 if ((iocb->ki_pos | count) & target->bt_logical_sectormask) 902 891 return -EINVAL; 903 892 904 - /* 905 - * For always COW inodes we also must check the alignment of each 906 - * individual iovec segment, as they could end up with different 907 - * I/Os due to the way bio_iov_iter_get_pages works, and we'd 908 - * then overwrite an already written block. 
909 - */ 910 - if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) || 911 - (xfs_is_always_cow_inode(ip) && 912 - (iov_iter_alignment(from) & ip->i_mount->m_blockmask))) 893 + if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask) 913 894 return xfs_file_dio_write_unaligned(ip, iocb, from); 914 895 if (xfs_is_zoned_inode(ip)) 915 896 return xfs_file_dio_write_zoned(ip, iocb, from); ··· 1257 1254 struct xfs_zone_alloc_ctx *ac) 1258 1255 { 1259 1256 struct inode *inode = file_inode(file); 1257 + struct xfs_inode *ip = XFS_I(inode); 1260 1258 unsigned int blksize = i_blocksize(inode); 1261 1259 loff_t new_size = 0; 1262 1260 int error; 1263 1261 1264 - trace_xfs_zero_file_space(XFS_I(inode)); 1262 + trace_xfs_zero_file_space(ip); 1265 1263 1266 1264 error = xfs_falloc_newsize(file, mode, offset, len, &new_size); 1267 1265 if (error) 1268 1266 return error; 1269 1267 1270 - error = xfs_free_file_space(XFS_I(inode), offset, len, ac); 1271 - if (error) 1272 - return error; 1268 + /* 1269 + * Zero range implements a full zeroing mechanism but is only used in 1270 + * limited situations. It is more efficient to allocate unwritten 1271 + * extents than to perform zeroing here, so use an errortag to randomly 1272 + * force zeroing on DEBUG kernels for added test coverage. 
1273 + */ 1274 + if (XFS_TEST_ERROR(ip->i_mount, 1275 + XFS_ERRTAG_FORCE_ZERO_RANGE)) { 1276 + error = xfs_zero_range(ip, offset, len, ac, NULL); 1277 + } else { 1278 + error = xfs_free_file_space(ip, offset, len, ac); 1279 + if (error) 1280 + return error; 1273 1281 1274 - len = round_up(offset + len, blksize) - round_down(offset, blksize); 1275 - offset = round_down(offset, blksize); 1276 - error = xfs_alloc_file_space(XFS_I(inode), offset, len); 1282 + len = round_up(offset + len, blksize) - 1283 + round_down(offset, blksize); 1284 + offset = round_down(offset, blksize); 1285 + error = xfs_alloc_file_space(ip, offset, len); 1286 + } 1277 1287 if (error) 1278 1288 return error; 1279 1289 return xfs_falloc_setsize(file, new_size);
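The fallocate path above expands the freed byte range outward to whole filesystem blocks before reallocating: `round_up(offset + len, blksize) - round_down(offset, blksize)`. A small sketch of that arithmetic, assuming (as the kernel macros do) that blksize is a power of two:

```c
#include <assert.h>
#include <stdint.h>

static uint64_t round_down_u64(uint64_t x, uint64_t blksize)
{
	return x & ~(blksize - 1);
}

static uint64_t round_up_u64(uint64_t x, uint64_t blksize)
{
	return (x + blksize - 1) & ~(blksize - 1);
}

/* Length of the block-aligned range covering [offset, offset + len). */
static uint64_t expanded_len(uint64_t offset, uint64_t len, uint64_t blksize)
{
	return round_up_u64(offset + len, blksize) -
	       round_down_u64(offset, blksize);
}
```

A sub-block range within one block expands to a single block, while a range straddling a block boundary expands to cover both blocks.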
+30 -8
fs/xfs/xfs_iomap.c
··· 1758 1758 struct iomap *iomap, 1759 1759 struct iomap *srcmap) 1760 1760 { 1761 + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, 1762 + iomap); 1761 1763 struct xfs_inode *ip = XFS_I(inode); 1762 1764 struct xfs_mount *mp = ip->i_mount; 1763 1765 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); ··· 1825 1823 } 1826 1824 1827 1825 /* 1828 - * For zeroing, trim a delalloc extent that extends beyond the EOF 1829 - * block. If it starts beyond the EOF block, convert it to an 1826 + * For zeroing, trim extents that extend beyond the EOF block. If a 1827 + * delalloc extent starts beyond the EOF block, convert it to an 1830 1828 * unwritten extent. 1831 1829 */ 1832 - if ((flags & IOMAP_ZERO) && imap.br_startoff <= offset_fsb && 1833 - isnullstartblock(imap.br_startblock)) { 1830 + if (flags & IOMAP_ZERO) { 1834 1831 xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); 1832 + u64 end; 1835 1833 1836 - if (offset_fsb >= eof_fsb) 1834 + if (isnullstartblock(imap.br_startblock) && 1835 + offset_fsb >= eof_fsb) 1837 1836 goto convert_delay; 1838 - if (end_fsb > eof_fsb) { 1837 + if (offset_fsb < eof_fsb && end_fsb > eof_fsb) 1839 1838 end_fsb = eof_fsb; 1840 - xfs_trim_extent(&imap, offset_fsb, 1841 - end_fsb - offset_fsb); 1839 + 1840 + /* 1841 + * Look up dirty folios for unwritten mappings within EOF. 1842 + * Providing this bypasses the flush iomap uses to trigger 1843 + * extent conversion when unwritten mappings have dirty 1844 + * pagecache in need of zeroing. 1845 + * 1846 + * Trim the mapping to the end pos of the lookup, which in turn 1847 + * was trimmed to the end of the batch if it became full before 1848 + * the end of the mapping. 
1849 + */ 1850 + if (imap.br_state == XFS_EXT_UNWRITTEN && 1851 + offset_fsb < eof_fsb) { 1852 + loff_t len = min(count, 1853 + XFS_FSB_TO_B(mp, imap.br_blockcount)); 1854 + 1855 + end = iomap_fill_dirty_folios(iter, offset, len); 1856 + end_fsb = min_t(xfs_fileoff_t, end_fsb, 1857 + XFS_B_TO_FSB(mp, end)); 1842 1858 } 1859 + 1860 + xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); 1843 1861 } 1844 1862 1845 1863 /*
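The xfs_iomap.c hunk above clamps the zero-range mapping twice: first to the EOF block, then to wherever the dirty-folio batch lookup stopped (the batch may fill before the end of the mapping). A simplified model of that clamping, with plain integers standing in for xfs_fileoff_t and the conversion macros:

```c
#include <assert.h>
#include <stdint.h>

/*
 * Clamp the mapping end block first to EOF, then to the point where the
 * dirty-folio batch lookup stopped, mirroring the two trims above.
 */
static uint64_t trim_end_fsb(uint64_t offset_fsb, uint64_t end_fsb,
			     uint64_t eof_fsb, uint64_t batch_end_fsb)
{
	if (offset_fsb < eof_fsb && end_fsb > eof_fsb)
		end_fsb = eof_fsb;
	if (batch_end_fsb < end_fsb)
		end_fsb = batch_end_fsb;
	return end_fsb;
}
```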
+3 -2
fs/zonefs/file.c
··· 112 112 113 113 static int zonefs_read_folio(struct file *unused, struct folio *folio) 114 114 { 115 - return iomap_read_folio(folio, &zonefs_read_iomap_ops); 115 + iomap_bio_read_folio(folio, &zonefs_read_iomap_ops); 116 + return 0; 116 117 } 117 118 118 119 static void zonefs_readahead(struct readahead_control *rac) 119 120 { 120 - iomap_readahead(rac, &zonefs_read_iomap_ops); 121 + iomap_bio_readahead(rac, &zonefs_read_iomap_ops); 121 122 } 122 123 123 124 /*
+9 -34
include/linux/fs.h
··· 367 367 #define IOCB_NOIO (1 << 20) 368 368 /* can use bio alloc cache */ 369 369 #define IOCB_ALLOC_CACHE (1 << 21) 370 - /* 371 - * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the 372 - * iocb completion can be passed back to the owner for execution from a safe 373 - * context rather than needing to be punted through a workqueue. If this 374 - * flag is set, the bio completion handling may set iocb->dio_complete to a 375 - * handler function and iocb->private to context information for that handler. 376 - * The issuer should call the handler with that context information from task 377 - * context to complete the processing of the iocb. Note that while this 378 - * provides a task context for the dio_complete() callback, it should only be 379 - * used on the completion side for non-IO generating completions. It's fine to 380 - * call blocking functions from this callback, but they should not wait for 381 - * unrelated IO (like cache flushing, new IO generation, etc). 382 - */ 383 - #define IOCB_DIO_CALLER_COMP (1 << 22) 384 370 /* kiocb is a read or write operation submitted by fs/aio.c. */ 385 - #define IOCB_AIO_RW (1 << 23) 386 - #define IOCB_HAS_METADATA (1 << 24) 371 + #define IOCB_AIO_RW (1 << 22) 372 + #define IOCB_HAS_METADATA (1 << 23) 387 373 388 374 /* for use in trace events */ 389 375 #define TRACE_IOCB_STRINGS \ ··· 386 400 { IOCB_WAITQ, "WAITQ" }, \ 387 401 { IOCB_NOIO, "NOIO" }, \ 388 402 { IOCB_ALLOC_CACHE, "ALLOC_CACHE" }, \ 389 - { IOCB_DIO_CALLER_COMP, "CALLER_COMP" }, \ 390 403 { IOCB_AIO_RW, "AIO_RW" }, \ 391 404 { IOCB_HAS_METADATA, "AIO_HAS_METADATA" } 392 405 ··· 397 412 int ki_flags; 398 413 u16 ki_ioprio; /* See linux/ioprio.h */ 399 414 u8 ki_write_stream; 400 - union { 401 - /* 402 - * Only used for async buffered reads, where it denotes the 403 - * page waitqueue associated with completing the read. Valid 404 - * IFF IOCB_WAITQ is set. 
405 - */ 406 - struct wait_page_queue *ki_waitq; 407 - /* 408 - * Can be used for O_DIRECT IO, where the completion handling 409 - * is punted back to the issuer of the IO. May only be set 410 - * if IOCB_DIO_CALLER_COMP is set by the issuer, and the issuer 411 - * must then check for presence of this handler when ki_complete 412 - * is invoked. The data passed in to this handler must be 413 - * assigned to ->private when dio_complete is assigned. 414 - */ 415 - ssize_t (*dio_complete)(void *data); 416 - }; 415 + 416 + /* 417 + * Only used for async buffered reads, where it denotes the page 418 + * waitqueue associated with completing the read. 419 + * Valid IFF IOCB_WAITQ is set. 420 + */ 421 + struct wait_page_queue *ki_waitq; 417 422 }; 418 423 419 424 static inline bool is_sync_kiocb(struct kiocb *kiocb)
+78 -8
include/linux/iomap.h
··· 9 9 #include <linux/types.h> 10 10 #include <linux/mm_types.h> 11 11 #include <linux/blkdev.h> 12 + #include <linux/pagevec.h> 12 13 13 14 struct address_space; 14 15 struct fiemap_extent_info; ··· 17 16 struct iomap_iter; 18 17 struct iomap_dio; 19 18 struct iomap_writepage_ctx; 19 + struct iomap_read_folio_ctx; 20 20 struct iov_iter; 21 21 struct kiocb; 22 22 struct page; ··· 243 241 unsigned flags; 244 242 struct iomap iomap; 245 243 struct iomap srcmap; 244 + struct folio_batch *fbatch; 246 245 void *private; 247 246 }; 248 247 249 248 int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops); 250 - int iomap_iter_advance(struct iomap_iter *iter, u64 *count); 249 + int iomap_iter_advance(struct iomap_iter *iter, u64 count); 251 250 252 251 /** 253 252 * iomap_length_trim - trimmed length of the current iomap iteration ··· 285 282 */ 286 283 static inline int iomap_iter_advance_full(struct iomap_iter *iter) 287 284 { 288 - u64 length = iomap_length(iter); 289 - 290 - return iomap_iter_advance(iter, &length); 285 + return iomap_iter_advance(iter, iomap_length(iter)); 291 286 } 292 287 293 288 /** ··· 340 339 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, 341 340 const struct iomap_ops *ops, 342 341 const struct iomap_write_ops *write_ops, void *private); 343 - int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); 344 - void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); 342 + void iomap_read_folio(const struct iomap_ops *ops, 343 + struct iomap_read_folio_ctx *ctx); 344 + void iomap_readahead(const struct iomap_ops *ops, 345 + struct iomap_read_folio_ctx *ctx); 345 346 bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); 346 347 struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len); 347 348 bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags); ··· 352 349 int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t 
len, 353 350 const struct iomap_ops *ops, 354 351 const struct iomap_write_ops *write_ops); 352 + loff_t iomap_fill_dirty_folios(struct iomap_iter *iter, loff_t offset, 353 + loff_t length); 355 354 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, 356 355 bool *did_zero, const struct iomap_ops *ops, 357 356 const struct iomap_write_ops *write_ops, void *private); ··· 435 430 * An existing mapping from a previous call to this method can be reused 436 431 * by the file system if it is still valid. 437 432 * 433 + * If this succeeds, iomap_finish_folio_write() must be called once 434 + * writeback completes for the range, regardless of whether the 435 + * writeback succeeded or failed. 436 + * 438 437 * Returns the number of bytes processed or a negative errno. 439 438 */ 440 439 ssize_t (*writeback_range)(struct iomap_writepage_ctx *wpc, ··· 476 467 loff_t pos, loff_t end_pos, unsigned int dirty_len); 477 468 int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error); 478 469 479 - void iomap_start_folio_write(struct inode *inode, struct folio *folio, 480 - size_t len); 470 + void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, 471 + int error); 481 472 void iomap_finish_folio_write(struct inode *inode, struct folio *folio, 482 473 size_t len); 483 474 484 475 int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio); 485 476 int iomap_writepages(struct iomap_writepage_ctx *wpc); 477 + 478 + struct iomap_read_folio_ctx { 479 + const struct iomap_read_ops *ops; 480 + struct folio *cur_folio; 481 + struct readahead_control *rac; 482 + void *read_ctx; 483 + }; 484 + 485 + struct iomap_read_ops { 486 + /* 487 + * Read in a folio range. 488 + * 489 + * If this succeeds, iomap_finish_folio_read() must be called after the 490 + * range is read in, regardless of whether the read succeeded or failed. 491 + * 492 + * Returns 0 on success or a negative error on failure. 
493 + */ 494 + int (*read_folio_range)(const struct iomap_iter *iter, 495 + struct iomap_read_folio_ctx *ctx, size_t len); 496 + 497 + /* 498 + * Submit any pending read requests. 499 + * 500 + * This is optional. 501 + */ 502 + void (*submit_read)(struct iomap_read_folio_ctx *ctx); 503 + }; 486 504 487 505 /* 488 506 * Flags for direct I/O ->end_io: ··· 554 518 */ 555 519 #define IOMAP_DIO_PARTIAL (1 << 2) 556 520 521 + /* 522 + * Ensure each bio is aligned to fs block size. 523 + * 524 + * For filesystems which need to calculate/verify the checksum of each fs 525 + * block. Otherwise they may not be able to handle unaligned bios. 526 + */ 527 + #define IOMAP_DIO_FSBLOCK_ALIGNED (1 << 3) 528 + 557 529 ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, 558 530 const struct iomap_ops *ops, const struct iomap_dio_ops *dops, 559 531 unsigned int dio_flags, void *private, size_t done_before); ··· 583 539 #endif /* CONFIG_SWAP */ 584 540 585 541 extern struct bio_set iomap_ioend_bioset; 542 + 543 + #ifdef CONFIG_BLOCK 544 + extern const struct iomap_read_ops iomap_bio_read_ops; 545 + 546 + static inline void iomap_bio_read_folio(struct folio *folio, 547 + const struct iomap_ops *ops) 548 + { 549 + struct iomap_read_folio_ctx ctx = { 550 + .ops = &iomap_bio_read_ops, 551 + .cur_folio = folio, 552 + }; 553 + 554 + iomap_read_folio(ops, &ctx); 555 + } 556 + 557 + static inline void iomap_bio_readahead(struct readahead_control *rac, 558 + const struct iomap_ops *ops) 559 + { 560 + struct iomap_read_folio_ctx ctx = { 561 + .ops = &iomap_bio_read_ops, 562 + .rac = rac, 563 + }; 564 + 565 + iomap_readahead(ops, &ctx); 566 + } 567 + #endif /* CONFIG_BLOCK */ 586 568 587 569 #endif /* LINUX_IOMAP_H */
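The new iomap_read_folio_ctx declared above dispatches reads through a caller-supplied ops table with a mandatory read_folio_range hook and an optional submit_read hook; the iomap_bio_* inline wrappers simply plug in the block-layer implementation. A self-contained toy model of that dispatch pattern (the names echo the header, but everything here is a stand-in, not the kernel API):

```c
#include <assert.h>
#include <stddef.h>

struct read_ctx;

struct read_ops {
	int (*read_folio_range)(struct read_ctx *ctx, size_t len);
	void (*submit_read)(struct read_ctx *ctx);	/* optional hook */
};

struct read_ctx {
	const struct read_ops *ops;
	size_t bytes_read;
	int submitted;
};

/* Generic driver: call the mandatory hook, then the optional one. */
static int drive_read(struct read_ctx *ctx, size_t len)
{
	int ret = ctx->ops->read_folio_range(ctx, len);

	if (ret)
		return ret;
	if (ctx->ops->submit_read)
		ctx->ops->submit_read(ctx);
	return 0;
}

/* A caller-provided implementation, as FUSE or the bio backend would be. */
static int fake_read_folio_range(struct read_ctx *ctx, size_t len)
{
	ctx->bytes_read += len;
	return 0;
}

static void fake_submit_read(struct read_ctx *ctx)
{
	ctx->submitted = 1;
}

static const struct read_ops fake_ops = {
	.read_folio_range = fake_read_folio_range,
	.submit_read = fake_submit_read,
};
```

This is the indirection that lets FUSE reuse the generic uptodate tracking while supplying its own I/O path, per the merge message.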
+2
include/linux/pagemap.h
··· 977 977 pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); 978 978 unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, 979 979 pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch); 980 + unsigned filemap_get_folios_dirty(struct address_space *mapping, 981 + pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); 980 982 981 983 struct folio *read_cache_folio(struct address_space *, pgoff_t index, 982 984 filler_t *filler, struct file *file);
+2 -14
io_uring/rw.c
··· 277 277 } else { 278 278 rw->kiocb.ki_ioprio = get_current_ioprio(); 279 279 } 280 - rw->kiocb.dio_complete = NULL; 281 280 rw->kiocb.ki_flags = 0; 282 281 rw->kiocb.ki_write_stream = READ_ONCE(sqe->write_stream); 283 282 ··· 568 569 569 570 void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw) 570 571 { 571 - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 572 - struct kiocb *kiocb = &rw->kiocb; 573 - 574 - if ((kiocb->ki_flags & IOCB_DIO_CALLER_COMP) && kiocb->dio_complete) { 575 - long res = kiocb->dio_complete(rw->kiocb.private); 576 - 577 - io_req_set_res(req, io_fixup_rw_res(req, res), 0); 578 - } 579 - 580 572 io_req_io_end(req); 581 573 582 574 if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) ··· 582 592 struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); 583 593 struct io_kiocb *req = cmd_to_io_kiocb(rw); 584 594 585 - if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) { 586 - __io_complete_rw_common(req, res); 587 - io_req_set_res(req, io_fixup_rw_res(req, res), 0); 588 - } 595 + __io_complete_rw_common(req, res); 596 + io_req_set_res(req, io_fixup_rw_res(req, res), 0); 589 597 req->io_task_work.func = io_req_rw_complete; 590 598 __io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE); 591 599 }
+58
mm/filemap.c
··· 2366 2366 } 2367 2367 EXPORT_SYMBOL(filemap_get_folios_tag); 2368 2368 2369 + /** 2370 + * filemap_get_folios_dirty - Get a batch of dirty folios 2371 + * @mapping: The address_space to search 2372 + * @start: The starting folio index 2373 + * @end: The final folio index (inclusive) 2374 + * @fbatch: The batch to fill 2375 + * 2376 + * filemap_get_folios_dirty() works exactly like filemap_get_folios(), except 2377 + * the returned folios are presumed to be dirty or undergoing writeback. Dirty 2378 + * state is presumed because we don't block on folio lock nor want to miss 2379 + * folios. Callers that need to can recheck state upon locking the folio. 2380 + * 2381 + * This may not return all dirty folios if the batch gets filled up. 2382 + * 2383 + * Return: The number of folios found. 2384 + * Also update @start to be positioned for traversal of the next folio. 2385 + */ 2386 + unsigned filemap_get_folios_dirty(struct address_space *mapping, pgoff_t *start, 2387 + pgoff_t end, struct folio_batch *fbatch) 2388 + { 2389 + XA_STATE(xas, &mapping->i_pages, *start); 2390 + struct folio *folio; 2391 + 2392 + rcu_read_lock(); 2393 + while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { 2394 + if (xa_is_value(folio)) 2395 + continue; 2396 + if (folio_trylock(folio)) { 2397 + bool clean = !folio_test_dirty(folio) && 2398 + !folio_test_writeback(folio); 2399 + folio_unlock(folio); 2400 + if (clean) { 2401 + folio_put(folio); 2402 + continue; 2403 + } 2404 + } 2405 + if (!folio_batch_add(fbatch, folio)) { 2406 + unsigned long nr = folio_nr_pages(folio); 2407 + *start = folio->index + nr; 2408 + goto out; 2409 + } 2410 + } 2411 + /* 2412 + * We come here when there is no folio beyond @end. We take care to not 2413 + * overflow the index @start as it confuses some of the callers. This 2414 + * breaks the iteration when there is a folio at index -1 but that is 2415 + * already broken anyway. 
2416 + */ 2417 + if (end == (pgoff_t)-1) 2418 + *start = (pgoff_t)-1; 2419 + else 2420 + *start = end + 1; 2421 + out: 2422 + rcu_read_unlock(); 2423 + 2424 + return folio_batch_count(fbatch); 2425 + } 2426 + 2369 2427 /* 2370 2428 * CD/DVDs are error prone. When a medium error occurs, the driver may fail 2371 2429 * a _large_ part of the i/o request. Imagine the worst scenario:
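The careful `*start` update at the end of filemap_get_folios_dirty() above exists so that the next call's starting index cannot wrap back to 0 when @end is the maximum page offset. The repositioning rule in isolation, with a plain integer standing in for pgoff_t:

```c
#include <assert.h>
#include <stdint.h>

typedef uint64_t pgoff_model_t;	/* stand-in for pgoff_t */

/*
 * After a scan that exhausted [start, end], position the next start one
 * past @end, but clamp at the maximum index so it cannot overflow to 0.
 */
static pgoff_model_t next_start_after_scan(pgoff_model_t end)
{
	if (end == (pgoff_model_t)-1)
		return (pgoff_model_t)-1;
	return end + 1;
}
```

Without the clamp, `end + 1` would wrap to 0 and a caller looping on `*start` would rescan the mapping from the beginning forever.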