Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-5.20/io_uring-buffered-writes-2022-07-29' of git://git.kernel.dk/linux-block

Pull io_uring buffered writes support from Jens Axboe:
"This contains support for buffered writes, specifically for XFS. btrfs
is in progress, will be coming in the next release.

io_uring does support buffered writes on any file type, but since the
buffered write path just always returns -EAGAIN (or -EOPNOTSUPP) on any attempt
to do so if IOCB_NOWAIT is set, any buffered write will effectively be
handled by io-wq offload. This isn't very efficient, and we even have
specific code in io-wq to serialize buffered writes to the same inode
to avoid further inefficiencies with thread offload.

This is particularly sad since most buffered writes don't block, they
simply copy data to a page and dirty it. With this pull request, we
can handle buffered writes a lot more efficiently.

If balance_dirty_pages() needs to block, we back off on writes as
indicated.

This improves buffered write support by 2-3x.

Jan Kara helped with the mm bits for this, and Stefan handled the
fs/iomap/xfs/io_uring parts of it"

* tag 'for-5.20/io_uring-buffered-writes-2022-07-29' of git://git.kernel.dk/linux-block:
mm: honor FGP_NOWAIT for page cache page allocation
xfs: Add async buffered write support
xfs: Specify lockmode when calling xfs_ilock_for_iomap()
io_uring: Add tracepoint for short writes
io_uring: fix issue with io_write() not always undoing sb_start_write()
io_uring: Add support for async buffered writes
fs: Add async write file modification handling.
fs: Split off inode_needs_update_time and __file_update_time
fs: add __remove_file_privs() with flags parameter
fs: add a FMODE_BUF_WASYNC flags for f_mode
iomap: Return -EAGAIN from iomap_write_iter()
iomap: Add async buffered write support
iomap: Add flags parameter to iomap_page_create()
mm: Add balance_dirty_pages_ratelimited_flags() function
mm: Move updates of dirty_exceeded into one place
mm: Move starting of background writeback into the main balancing loop

+324 -109
+125 -45
fs/inode.c
··· 2010 2010 return notify_change(mnt_userns, dentry, &newattrs, NULL); 2011 2011 } 2012 2012 2013 - /* 2014 - * Remove special file priviledges (suid, capabilities) when file is written 2015 - * to or truncated. 2016 - */ 2017 - int file_remove_privs(struct file *file) 2013 + static int __file_remove_privs(struct file *file, unsigned int flags) 2018 2014 { 2019 2015 struct dentry *dentry = file_dentry(file); 2020 2016 struct inode *inode = file_inode(file); 2017 + int error; 2021 2018 int kill; 2022 - int error = 0; 2023 2019 2024 - /* 2025 - * Fast path for nothing security related. 2026 - * As well for non-regular files, e.g. blkdev inodes. 2027 - * For example, blkdev_write_iter() might get here 2028 - * trying to remove privs which it is not allowed to. 2029 - */ 2030 2020 if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) 2031 2021 return 0; 2032 2022 2033 2023 kill = dentry_needs_remove_privs(dentry); 2034 - if (kill < 0) 2024 + if (kill <= 0) 2035 2025 return kill; 2036 - if (kill) 2037 - error = __remove_privs(file_mnt_user_ns(file), dentry, kill); 2026 + 2027 + if (flags & IOCB_NOWAIT) 2028 + return -EAGAIN; 2029 + 2030 + error = __remove_privs(file_mnt_user_ns(file), dentry, kill); 2038 2031 if (!error) 2039 2032 inode_has_no_xattr(inode); 2040 2033 2041 2034 return error; 2042 2035 } 2043 - EXPORT_SYMBOL(file_remove_privs); 2044 2036 2045 2037 /** 2046 - * file_update_time - update mtime and ctime time 2047 - * @file: file accessed 2038 + * file_remove_privs - remove special file privileges (suid, capabilities) 2039 + * @file: file to remove privileges from 2048 2040 * 2049 - * Update the mtime and ctime members of an inode and mark the inode 2050 - * for writeback. Note that this function is meant exclusively for 2051 - * usage in the file write path of filesystems, and filesystems may 2052 - * choose to explicitly ignore update via this function with the 2053 - * S_NOCMTIME inode flag, e.g. 
for network filesystem where these 2054 - * timestamps are handled by the server. This can return an error for 2055 - * file systems who need to allocate space in order to update an inode. 2041 + * When file is modified by a write or truncation ensure that special 2042 + * file privileges are removed. 2043 + * 2044 + * Return: 0 on success, negative errno on failure. 2056 2045 */ 2057 - 2058 - int file_update_time(struct file *file) 2046 + int file_remove_privs(struct file *file) 2059 2047 { 2060 - struct inode *inode = file_inode(file); 2061 - struct timespec64 now; 2048 + return __file_remove_privs(file, 0); 2049 + } 2050 + EXPORT_SYMBOL(file_remove_privs); 2051 + 2052 + static int inode_needs_update_time(struct inode *inode, struct timespec64 *now) 2053 + { 2062 2054 int sync_it = 0; 2063 - int ret; 2064 2055 2065 2056 /* First try to exhaust all avenues to not sync */ 2066 2057 if (IS_NOCMTIME(inode)) 2067 2058 return 0; 2068 2059 2069 - now = current_time(inode); 2070 - if (!timespec64_equal(&inode->i_mtime, &now)) 2060 + if (!timespec64_equal(&inode->i_mtime, now)) 2071 2061 sync_it = S_MTIME; 2072 2062 2073 - if (!timespec64_equal(&inode->i_ctime, &now)) 2063 + if (!timespec64_equal(&inode->i_ctime, now)) 2074 2064 sync_it |= S_CTIME; 2075 2065 2076 2066 if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) ··· 2069 2079 if (!sync_it) 2070 2080 return 0; 2071 2081 2072 - /* Finally allowed to write? Takes lock. 
*/ 2073 - if (__mnt_want_write_file(file)) 2074 - return 0; 2082 + return sync_it; 2083 + } 2075 2084 2076 - ret = inode_update_time(inode, &now, sync_it); 2077 - __mnt_drop_write_file(file); 2085 + static int __file_update_time(struct file *file, struct timespec64 *now, 2086 + int sync_mode) 2087 + { 2088 + int ret = 0; 2089 + struct inode *inode = file_inode(file); 2090 + 2091 + /* try to update time settings */ 2092 + if (!__mnt_want_write_file(file)) { 2093 + ret = inode_update_time(inode, now, sync_mode); 2094 + __mnt_drop_write_file(file); 2095 + } 2078 2096 2079 2097 return ret; 2080 2098 } 2099 + 2100 + /** 2101 + * file_update_time - update mtime and ctime time 2102 + * @file: file accessed 2103 + * 2104 + * Update the mtime and ctime members of an inode and mark the inode for 2105 + * writeback. Note that this function is meant exclusively for usage in 2106 + * the file write path of filesystems, and filesystems may choose to 2107 + * explicitly ignore updates via this function with the _NOCMTIME inode 2108 + * flag, e.g. for network filesystem where these imestamps are handled 2109 + * by the server. This can return an error for file systems who need to 2110 + * allocate space in order to update an inode. 2111 + * 2112 + * Return: 0 on success, negative errno on failure. 
2113 + */ 2114 + int file_update_time(struct file *file) 2115 + { 2116 + int ret; 2117 + struct inode *inode = file_inode(file); 2118 + struct timespec64 now = current_time(inode); 2119 + 2120 + ret = inode_needs_update_time(inode, &now); 2121 + if (ret <= 0) 2122 + return ret; 2123 + 2124 + return __file_update_time(file, &now, ret); 2125 + } 2081 2126 EXPORT_SYMBOL(file_update_time); 2082 2127 2083 - /* Caller must hold the file's inode lock */ 2084 - int file_modified(struct file *file) 2128 + /** 2129 + * file_modified_flags - handle mandated vfs changes when modifying a file 2130 + * @file: file that was modified 2131 + * @flags: kiocb flags 2132 + * 2133 + * When file has been modified ensure that special 2134 + * file privileges are removed and time settings are updated. 2135 + * 2136 + * If IOCB_NOWAIT is set, special file privileges will not be removed and 2137 + * time settings will not be updated. It will return -EAGAIN. 2138 + * 2139 + * Context: Caller must hold the file's inode lock. 2140 + * 2141 + * Return: 0 on success, negative errno on failure. 2142 + */ 2143 + static int file_modified_flags(struct file *file, int flags) 2085 2144 { 2086 - int err; 2145 + int ret; 2146 + struct inode *inode = file_inode(file); 2147 + struct timespec64 now = current_time(inode); 2087 2148 2088 2149 /* 2089 2150 * Clear the security bits if the process is not being run by root. 2090 2151 * This keeps people from modifying setuid and setgid binaries. 
2091 2152 */ 2092 - err = file_remove_privs(file); 2093 - if (err) 2094 - return err; 2153 + ret = __file_remove_privs(file, flags); 2154 + if (ret) 2155 + return ret; 2095 2156 2096 2157 if (unlikely(file->f_mode & FMODE_NOCMTIME)) 2097 2158 return 0; 2098 2159 2099 - return file_update_time(file); 2160 + ret = inode_needs_update_time(inode, &now); 2161 + if (ret <= 0) 2162 + return ret; 2163 + if (flags & IOCB_NOWAIT) 2164 + return -EAGAIN; 2165 + 2166 + return __file_update_time(file, &now, ret); 2167 + } 2168 + 2169 + /** 2170 + * file_modified - handle mandated vfs changes when modifying a file 2171 + * @file: file that was modified 2172 + * 2173 + * When file has been modified ensure that special 2174 + * file privileges are removed and time settings are updated. 2175 + * 2176 + * Context: Caller must hold the file's inode lock. 2177 + * 2178 + * Return: 0 on success, negative errno on failure. 2179 + */ 2180 + int file_modified(struct file *file) 2181 + { 2182 + return file_modified_flags(file, 0); 2100 2183 } 2101 2184 EXPORT_SYMBOL(file_modified); 2185 + 2186 + /** 2187 + * kiocb_modified - handle mandated vfs changes when modifying a file 2188 + * @iocb: iocb that was modified 2189 + * 2190 + * When file has been modified ensure that special 2191 + * file privileges are removed and time settings are updated. 2192 + * 2193 + * Context: Caller must hold the file's inode lock. 2194 + * 2195 + * Return: 0 on success, negative errno on failure. 2196 + */ 2197 + int kiocb_modified(struct kiocb *iocb) 2198 + { 2199 + return file_modified_flags(iocb->ki_filp, iocb->ki_flags); 2200 + } 2201 + EXPORT_SYMBOL_GPL(kiocb_modified); 2102 2202 2103 2203 int inode_needs_sync(struct inode *inode) 2104 2204 {
+52 -15
fs/iomap/buffered-io.c
··· 44 44 static struct bio_set iomap_ioend_bioset; 45 45 46 46 static struct iomap_page * 47 - iomap_page_create(struct inode *inode, struct folio *folio) 47 + iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags) 48 48 { 49 49 struct iomap_page *iop = to_iomap_page(folio); 50 50 unsigned int nr_blocks = i_blocks_per_folio(inode, folio); 51 + gfp_t gfp; 51 52 52 53 if (iop || nr_blocks <= 1) 53 54 return iop; 54 55 56 + if (flags & IOMAP_NOWAIT) 57 + gfp = GFP_NOWAIT; 58 + else 59 + gfp = GFP_NOFS | __GFP_NOFAIL; 60 + 55 61 iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)), 56 - GFP_NOFS | __GFP_NOFAIL); 57 - spin_lock_init(&iop->uptodate_lock); 58 - if (folio_test_uptodate(folio)) 59 - bitmap_fill(iop->uptodate, nr_blocks); 60 - folio_attach_private(folio, iop); 62 + gfp); 63 + if (iop) { 64 + spin_lock_init(&iop->uptodate_lock); 65 + if (folio_test_uptodate(folio)) 66 + bitmap_fill(iop->uptodate, nr_blocks); 67 + folio_attach_private(folio, iop); 68 + } 61 69 return iop; 62 70 } 63 71 ··· 234 226 if (WARN_ON_ONCE(size > iomap->length)) 235 227 return -EIO; 236 228 if (offset > 0) 237 - iop = iomap_page_create(iter->inode, folio); 229 + iop = iomap_page_create(iter->inode, folio, iter->flags); 238 230 else 239 231 iop = to_iomap_page(folio); 240 232 ··· 272 264 return iomap_read_inline_data(iter, folio); 273 265 274 266 /* zero post-eof blocks as the page may be mapped */ 275 - iop = iomap_page_create(iter->inode, folio); 267 + iop = iomap_page_create(iter->inode, folio, iter->flags); 276 268 iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen); 277 269 if (plen == 0) 278 270 goto done; ··· 555 547 size_t len, struct folio *folio) 556 548 { 557 549 const struct iomap *srcmap = iomap_iter_srcmap(iter); 558 - struct iomap_page *iop = iomap_page_create(iter->inode, folio); 550 + struct iomap_page *iop; 559 551 loff_t block_size = i_blocksize(iter->inode); 560 552 loff_t block_start = round_down(pos, 
block_size); 561 553 loff_t block_end = round_up(pos + len, block_size); 554 + unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio); 562 555 size_t from = offset_in_folio(folio, pos), to = from + len; 563 556 size_t poff, plen; 564 557 565 558 if (folio_test_uptodate(folio)) 566 559 return 0; 567 560 folio_clear_error(folio); 561 + 562 + iop = iomap_page_create(iter->inode, folio, iter->flags); 563 + if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1) 564 + return -EAGAIN; 568 565 569 566 do { 570 567 iomap_adjust_read_range(iter->inode, folio, &block_start, ··· 587 574 return -EIO; 588 575 folio_zero_segments(folio, poff, from, to, poff + plen); 589 576 } else { 590 - int status = iomap_read_folio_sync(block_start, folio, 577 + int status; 578 + 579 + if (iter->flags & IOMAP_NOWAIT) 580 + return -EAGAIN; 581 + 582 + status = iomap_read_folio_sync(block_start, folio, 591 583 poff, plen, srcmap); 592 584 if (status) 593 585 return status; ··· 621 603 unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; 622 604 int status = 0; 623 605 606 + if (iter->flags & IOMAP_NOWAIT) 607 + fgp |= FGP_NOWAIT; 608 + 624 609 BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); 625 610 if (srcmap != &iter->iomap) 626 611 BUG_ON(pos + len > srcmap->offset + srcmap->length); ··· 643 622 folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, 644 623 fgp, mapping_gfp_mask(iter->inode->i_mapping)); 645 624 if (!folio) { 646 - status = -ENOMEM; 625 + status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM; 647 626 goto out_no_page; 648 627 } 649 628 if (pos + len > folio_pos(folio) + folio_size(folio)) ··· 761 740 loff_t pos = iter->pos; 762 741 ssize_t written = 0; 763 742 long status = 0; 743 + struct address_space *mapping = iter->inode->i_mapping; 744 + unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? 
BDP_ASYNC : 0; 764 745 765 746 do { 766 747 struct folio *folio; ··· 775 752 bytes = min_t(unsigned long, PAGE_SIZE - offset, 776 753 iov_iter_count(i)); 777 754 again: 755 + status = balance_dirty_pages_ratelimited_flags(mapping, 756 + bdp_flags); 757 + if (unlikely(status)) 758 + break; 759 + 778 760 if (bytes > length) 779 761 bytes = length; 780 762 ··· 788 760 * Otherwise there's a nasty deadlock on copying from the 789 761 * same page as we're writing to, without it being marked 790 762 * up-to-date. 763 + * 764 + * For async buffered writes the assumption is that the user 765 + * page has already been faulted in. This can be optimized by 766 + * faulting the user page. 791 767 */ 792 768 if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { 793 769 status = -EFAULT; ··· 803 771 break; 804 772 805 773 page = folio_file_page(folio, pos >> PAGE_SHIFT); 806 - if (mapping_writably_mapped(iter->inode->i_mapping)) 774 + if (mapping_writably_mapped(mapping)) 807 775 flush_dcache_page(page); 808 776 809 777 copied = copy_page_from_iter_atomic(page, offset, bytes, i); ··· 828 796 pos += status; 829 797 written += status; 830 798 length -= status; 831 - 832 - balance_dirty_pages_ratelimited(iter->inode->i_mapping); 833 799 } while (iov_iter_count(i) && length); 834 800 801 + if (status == -EAGAIN) { 802 + iov_iter_revert(i, written); 803 + return -EAGAIN; 804 + } 835 805 return written ? 
written : status; 836 806 } 837 807 ··· 848 814 .flags = IOMAP_WRITE, 849 815 }; 850 816 int ret; 817 + 818 + if (iocb->ki_flags & IOCB_NOWAIT) 819 + iter.flags |= IOMAP_NOWAIT; 851 820 852 821 while ((ret = iomap_iter(&iter, ops)) > 0) 853 822 iter.processed = iomap_write_iter(&iter, i); ··· 1366 1329 struct writeback_control *wbc, struct inode *inode, 1367 1330 struct folio *folio, u64 end_pos) 1368 1331 { 1369 - struct iomap_page *iop = iomap_page_create(inode, folio); 1332 + struct iomap_page *iop = iomap_page_create(inode, folio, 0); 1370 1333 struct iomap_ioend *ioend, *next; 1371 1334 unsigned len = i_blocksize(inode); 1372 1335 unsigned nblocks = i_blocks_per_folio(inode, folio);
+3 -1
fs/read_write.c
··· 1663 1663 if (iocb->ki_flags & IOCB_APPEND) 1664 1664 iocb->ki_pos = i_size_read(inode); 1665 1665 1666 - if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) 1666 + if ((iocb->ki_flags & IOCB_NOWAIT) && 1667 + !((iocb->ki_flags & IOCB_DIRECT) || 1668 + (file->f_mode & FMODE_BUF_WASYNC))) 1667 1669 return -EINVAL; 1668 1670 1669 1671 return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
+5 -6
fs/xfs/xfs_file.c
··· 410 410 spin_unlock(&ip->i_flags_lock); 411 411 412 412 out: 413 - return file_modified(file); 413 + return kiocb_modified(iocb); 414 414 } 415 415 416 416 static int ··· 700 700 bool cleared_space = false; 701 701 unsigned int iolock; 702 702 703 - if (iocb->ki_flags & IOCB_NOWAIT) 704 - return -EOPNOTSUPP; 705 - 706 703 write_retry: 707 704 iolock = XFS_IOLOCK_EXCL; 708 - xfs_ilock(ip, iolock); 705 + ret = xfs_ilock_iocb(iocb, iolock); 706 + if (ret) 707 + return ret; 709 708 710 709 ret = xfs_file_write_checks(iocb, from, &iolock); 711 710 if (ret) ··· 1164 1165 { 1165 1166 if (xfs_is_shutdown(XFS_M(inode->i_sb))) 1166 1167 return -EIO; 1167 - file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; 1168 + file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC; 1168 1169 return generic_file_open(inode, file); 1169 1170 } 1170 1171
+7 -4
fs/xfs/xfs_iomap.c
··· 664 664 unsigned flags, 665 665 unsigned *lockmode) 666 666 { 667 - unsigned mode = XFS_ILOCK_SHARED; 667 + unsigned int mode = *lockmode; 668 668 bool is_write = flags & (IOMAP_WRITE | IOMAP_ZERO); 669 669 670 670 /* ··· 742 742 int nimaps = 1, error = 0; 743 743 bool shared = false; 744 744 u16 iomap_flags = 0; 745 - unsigned lockmode; 745 + unsigned int lockmode = XFS_ILOCK_SHARED; 746 746 747 747 ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); 748 748 ··· 886 886 bool eof = false, cow_eof = false, shared = false; 887 887 int allocfork = XFS_DATA_FORK; 888 888 int error = 0; 889 + unsigned int lockmode = XFS_ILOCK_EXCL; 889 890 890 891 if (xfs_is_shutdown(mp)) 891 892 return -EIO; ··· 898 897 899 898 ASSERT(!XFS_IS_REALTIME_INODE(ip)); 900 899 901 - xfs_ilock(ip, XFS_ILOCK_EXCL); 900 + error = xfs_ilock_for_iomap(ip, flags, &lockmode); 901 + if (error) 902 + return error; 902 903 903 904 if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || 904 905 XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { ··· 1175 1172 xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); 1176 1173 int nimaps = 1, error = 0; 1177 1174 bool shared = false; 1178 - unsigned lockmode; 1175 + unsigned int lockmode = XFS_ILOCK_SHARED; 1179 1176 1180 1177 ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO))); 1181 1178
+4
include/linux/fs.h
··· 180 180 /* File supports async buffered reads */ 181 181 #define FMODE_BUF_RASYNC ((__force fmode_t)0x40000000) 182 182 183 + /* File supports async nowait buffered writes */ 184 + #define FMODE_BUF_WASYNC ((__force fmode_t)0x80000000) 185 + 183 186 /* 184 187 * Attribute flags. These should be or-ed together to figure out what 185 188 * has been changed! ··· 2518 2515 } 2519 2516 2520 2517 extern int file_modified(struct file *file); 2518 + int kiocb_modified(struct kiocb *iocb); 2521 2519 2522 2520 int sync_inode_metadata(struct inode *inode, int wait); 2523 2521
+7
include/linux/writeback.h
··· 364 364 unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); 365 365 366 366 void wb_update_bandwidth(struct bdi_writeback *wb); 367 + 368 + /* Invoke balance dirty pages in async mode. */ 369 + #define BDP_ASYNC 0x0001 370 + 367 371 void balance_dirty_pages_ratelimited(struct address_space *mapping); 372 + int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, 373 + unsigned int flags); 374 + 368 375 bool wb_over_bg_thresh(struct bdi_writeback *wb); 369 376 370 377 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
+25
include/trace/events/io_uring.h
··· 630 630 __entry->tctx, __entry->count, __entry->loops) 631 631 ); 632 632 633 + TRACE_EVENT(io_uring_short_write, 634 + 635 + TP_PROTO(void *ctx, u64 fpos, u64 wanted, u64 got), 636 + 637 + TP_ARGS(ctx, fpos, wanted, got), 638 + 639 + TP_STRUCT__entry( 640 + __field(void *, ctx) 641 + __field(u64, fpos) 642 + __field(u64, wanted) 643 + __field(u64, got) 644 + ), 645 + 646 + TP_fast_assign( 647 + __entry->ctx = ctx; 648 + __entry->fpos = fpos; 649 + __entry->wanted = wanted; 650 + __entry->got = got; 651 + ), 652 + 653 + TP_printk("ring %p, fpos %lld, wanted %lld, got %lld", 654 + __entry->ctx, __entry->fpos, 655 + __entry->wanted, __entry->got) 656 + ); 657 + 633 658 #endif /* _TRACE_IO_URING_H */ 634 659 635 660 /* This part must be outside protection */
+35 -6
io_uring/rw.c
··· 641 641 return -EINVAL; 642 642 } 643 643 644 - static bool need_read_all(struct io_kiocb *req) 644 + static bool need_complete_io(struct io_kiocb *req) 645 645 { 646 646 return req->flags & REQ_F_ISREG || 647 647 S_ISBLK(file_inode(req->file)->i_mode); ··· 775 775 kfree(iovec); 776 776 return IOU_ISSUE_SKIP_COMPLETE; 777 777 } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || 778 - (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { 778 + (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) { 779 779 /* read all, failed, already did sync or don't want to retry */ 780 780 goto done; 781 781 } ··· 870 870 if (unlikely(!io_file_supports_nowait(req))) 871 871 goto copy_iov; 872 872 873 - /* file path doesn't support NOWAIT for non-direct_IO */ 874 - if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && 875 - (req->flags & REQ_F_ISREG)) 873 + /* File path supports NOWAIT for non-direct_IO only for block devices. */ 874 + if (!(kiocb->ki_flags & IOCB_DIRECT) && 875 + !(kiocb->ki_filp->f_mode & FMODE_BUF_WASYNC) && 876 + (req->flags & REQ_F_ISREG)) 876 877 goto copy_iov; 877 878 878 879 kiocb->ki_flags |= IOCB_NOWAIT; ··· 929 928 /* IOPOLL retry should happen for io-wq threads */ 930 929 if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) 931 930 goto copy_iov; 931 + 932 + if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) { 933 + struct io_async_rw *rw; 934 + 935 + trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2, 936 + req->cqe.res, ret2); 937 + 938 + /* This is a partial write. The file pos has already been 939 + * updated, setup the async struct to complete the request 940 + * in the worker. Also update bytes_done to account for 941 + * the bytes already written. 
942 + */ 943 + iov_iter_save_state(&s->iter, &s->iter_state); 944 + ret = io_setup_async_rw(req, iovec, s, true); 945 + 946 + rw = req->async_data; 947 + if (rw) 948 + rw->bytes_done += ret2; 949 + 950 + if (kiocb->ki_flags & IOCB_WRITE) 951 + kiocb_end_write(req); 952 + return ret ? ret : -EAGAIN; 953 + } 932 954 done: 933 955 ret = kiocb_done(req, ret2, issue_flags); 934 956 } else { 935 957 copy_iov: 936 958 iov_iter_restore(&s->iter, &s->iter_state); 937 959 ret = io_setup_async_rw(req, iovec, s, false); 938 - return ret ?: -EAGAIN; 960 + if (!ret) { 961 + if (kiocb->ki_flags & IOCB_WRITE) 962 + kiocb_end_write(req); 963 + return -EAGAIN; 964 + } 965 + return ret; 939 966 } 940 967 /* it's reportedly faster than delegating the null check to kfree() */ 941 968 if (iovec)
+4
mm/filemap.c
··· 1988 1988 gfp |= __GFP_WRITE; 1989 1989 if (fgp_flags & FGP_NOFS) 1990 1990 gfp &= ~__GFP_FS; 1991 + if (fgp_flags & FGP_NOWAIT) { 1992 + gfp &= ~GFP_KERNEL; 1993 + gfp |= GFP_NOWAIT | __GFP_NOWARN; 1994 + } 1991 1995 1992 1996 folio = filemap_alloc_folio(gfp, 0); 1993 1997 if (!folio)
+57 -32
mm/page-writeback.c
··· 1554 1554 * If we're over `background_thresh' then the writeback threads are woken to 1555 1555 * perform some writeout. 1556 1556 */ 1557 - static void balance_dirty_pages(struct bdi_writeback *wb, 1558 - unsigned long pages_dirtied) 1557 + static int balance_dirty_pages(struct bdi_writeback *wb, 1558 + unsigned long pages_dirtied, unsigned int flags) 1559 1559 { 1560 1560 struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; 1561 1561 struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; ··· 1575 1575 struct backing_dev_info *bdi = wb->bdi; 1576 1576 bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; 1577 1577 unsigned long start_time = jiffies; 1578 + int ret = 0; 1578 1579 1579 1580 for (;;) { 1580 1581 unsigned long now = jiffies; ··· 1629 1628 } 1630 1629 1631 1630 /* 1631 + * In laptop mode, we wait until hitting the higher threshold 1632 + * before starting background writeout, and then write out all 1633 + * the way down to the lower threshold. So slow writers cause 1634 + * minimal disk activity. 1635 + * 1636 + * In normal mode, we start background writeout at the lower 1637 + * background_thresh, to keep the amount of dirty memory low. 1638 + */ 1639 + if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh && 1640 + !writeback_in_progress(wb)) 1641 + wb_start_background_writeback(wb); 1642 + 1643 + /* 1632 1644 * Throttle it only when the background writeback cannot 1633 1645 * catch-up. This avoids (excessively) small writeouts 1634 1646 * when the wb limits are ramping up in case of !strictlimit. 
··· 1671 1657 break; 1672 1658 } 1673 1659 1660 + /* Start writeback even when in laptop mode */ 1674 1661 if (unlikely(!writeback_in_progress(wb))) 1675 1662 wb_start_background_writeback(wb); 1676 1663 ··· 1730 1715 sdtc = mdtc; 1731 1716 } 1732 1717 1733 - if (dirty_exceeded && !wb->dirty_exceeded) 1734 - wb->dirty_exceeded = 1; 1718 + if (dirty_exceeded != wb->dirty_exceeded) 1719 + wb->dirty_exceeded = dirty_exceeded; 1735 1720 1736 1721 if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + 1737 1722 BANDWIDTH_INTERVAL)) ··· 1804 1789 period, 1805 1790 pause, 1806 1791 start_time); 1792 + if (flags & BDP_ASYNC) { 1793 + ret = -EAGAIN; 1794 + break; 1795 + } 1807 1796 __set_current_state(TASK_KILLABLE); 1808 1797 wb->dirty_sleep = now; 1809 1798 io_schedule_timeout(pause); ··· 1839 1820 if (fatal_signal_pending(current)) 1840 1821 break; 1841 1822 } 1842 - 1843 - if (!dirty_exceeded && wb->dirty_exceeded) 1844 - wb->dirty_exceeded = 0; 1845 - 1846 - if (writeback_in_progress(wb)) 1847 - return; 1848 - 1849 - /* 1850 - * In laptop mode, we wait until hitting the higher threshold before 1851 - * starting background writeout, and then write out all the way down 1852 - * to the lower threshold. So slow writers cause minimal disk activity. 1853 - * 1854 - * In normal mode, we start background writeout at the lower 1855 - * background_thresh, to keep the amount of dirty memory low. 1856 - */ 1857 - if (laptop_mode) 1858 - return; 1859 - 1860 - if (nr_reclaimable > gdtc->bg_thresh) 1861 - wb_start_background_writeback(wb); 1823 + return ret; 1862 1824 } 1863 1825 1864 1826 static DEFINE_PER_CPU(int, bdp_ratelimits); ··· 1861 1861 DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; 1862 1862 1863 1863 /** 1864 - * balance_dirty_pages_ratelimited - balance dirty memory state 1865 - * @mapping: address_space which was dirtied 1864 + * balance_dirty_pages_ratelimited_flags - Balance dirty memory state. 1865 + * @mapping: address_space which was dirtied. 
1866 + * @flags: BDP flags. 1866 1867 * 1867 1868 * Processes which are dirtying memory should call in here once for each page 1868 1869 * which was newly dirtied. The function will periodically check the system's 1869 1870 * dirty state and will initiate writeback if needed. 1870 1871 * 1871 - * Once we're over the dirty memory limit we decrease the ratelimiting 1872 - * by a lot, to prevent individual processes from overshooting the limit 1873 - * by (ratelimit_pages) each. 1872 + * See balance_dirty_pages_ratelimited() for details. 1873 + * 1874 + * Return: If @flags contains BDP_ASYNC, it may return -EAGAIN to 1875 + * indicate that memory is out of balance and the caller must wait 1876 + * for I/O to complete. Otherwise, it will return 0 to indicate 1877 + * that either memory was already in balance, or it was able to sleep 1878 + * until the amount of dirty memory returned to balance. 1874 1879 */ 1875 - void balance_dirty_pages_ratelimited(struct address_space *mapping) 1880 + int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, 1881 + unsigned int flags) 1876 1882 { 1877 1883 struct inode *inode = mapping->host; 1878 1884 struct backing_dev_info *bdi = inode_to_bdi(inode); 1879 1885 struct bdi_writeback *wb = NULL; 1880 1886 int ratelimit; 1887 + int ret = 0; 1881 1888 int *p; 1882 1889 1883 1890 if (!(bdi->capabilities & BDI_CAP_WRITEBACK)) 1884 - return; 1891 + return ret; 1885 1892 1886 1893 if (inode_cgwb_enabled(inode)) 1887 1894 wb = wb_get_create_current(bdi, GFP_KERNEL); ··· 1928 1921 preempt_enable(); 1929 1922 1930 1923 if (unlikely(current->nr_dirtied >= ratelimit)) 1931 - balance_dirty_pages(wb, current->nr_dirtied); 1924 + ret = balance_dirty_pages(wb, current->nr_dirtied, flags); 1932 1925 1933 1926 wb_put(wb); 1927 + return ret; 1928 + } 1929 + 1930 + /** 1931 + * balance_dirty_pages_ratelimited - balance dirty memory state. 1932 + * @mapping: address_space which was dirtied. 
1933 + * 1934 + * Processes which are dirtying memory should call in here once for each page 1935 + * which was newly dirtied. The function will periodically check the system's 1936 + * dirty state and will initiate writeback if needed. 1937 + * 1938 + * Once we're over the dirty memory limit we decrease the ratelimiting 1939 + * by a lot, to prevent individual processes from overshooting the limit 1940 + * by (ratelimit_pages) each. 1941 + */ 1942 + void balance_dirty_pages_ratelimited(struct address_space *mapping) 1943 + { 1944 + balance_dirty_pages_ratelimited_flags(mapping, 0); 1934 1945 } 1935 1946 EXPORT_SYMBOL(balance_dirty_pages_ratelimited); 1936 1947