Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ext2: Move direct-io to use iomap

This patch converts ext2 direct-io path to iomap interface.
- This also takes care of the DIO_SKIP_HOLES part, in which we return -ENOTBLK
from ext2_iomap_begin() if the write is done on a hole.
- This falls back to buffered-io in case of DIO_SKIP_HOLES, in case of
a partial write, or if any error is detected in ext2_iomap_end().
We try to return -ENOTBLK in such cases.
- For any unaligned or extending DIO writes, we pass
IOMAP_DIO_FORCE_WAIT flag to ensure synchronous writes.
- For extending writes we set IOMAP_F_DIRTY in ext2_iomap_begin because
otherwise with dsync writes on devices that support FUA, generic_write_sync
won't be called and we might miss inode metadata updates.
- Since ext2 already uses the _nolock variant of sync write, there is
no inode lock problem with iomap in this patch.
- ext2_iomap_ops are now being shared by DIO, DAX & fiemap path

Tested-by: Disha Goel <disgoel@linux.ibm.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Message-Id: <610b672a52f2a7ff6dc550fd14d0f995806232a5.1682069716.git.ritesh.list@gmail.com>

authored by

Ritesh Harjani (IBM) and committed by
Jan Kara
fb5de435 d0530704

+150 -19
+1
fs/ext2/ext2.h
··· 754 754 extern struct inode *ext2_iget (struct super_block *, unsigned long); 755 755 extern int ext2_write_inode (struct inode *, struct writeback_control *); 756 756 extern void ext2_evict_inode(struct inode *); 757 + void ext2_write_failed(struct address_space *mapping, loff_t to); 757 758 extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); 758 759 extern int ext2_setattr (struct mnt_idmap *, struct dentry *, struct iattr *); 759 760 extern int ext2_getattr (struct mnt_idmap *, const struct path *,
+115
fs/ext2/file.c
··· 162 162 return ret; 163 163 } 164 164 165 + static ssize_t ext2_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) 166 + { 167 + struct file *file = iocb->ki_filp; 168 + struct inode *inode = file->f_mapping->host; 169 + ssize_t ret; 170 + 171 + inode_lock_shared(inode); 172 + ret = iomap_dio_rw(iocb, to, &ext2_iomap_ops, NULL, 0, NULL, 0); 173 + inode_unlock_shared(inode); 174 + 175 + return ret; 176 + } 177 + 178 + static int ext2_dio_write_end_io(struct kiocb *iocb, ssize_t size, 179 + int error, unsigned int flags) 180 + { 181 + loff_t pos = iocb->ki_pos; 182 + struct inode *inode = file_inode(iocb->ki_filp); 183 + 184 + if (error) 185 + goto out; 186 + 187 + /* 188 + * If we are extending the file, we have to update i_size here before 189 + * page cache gets invalidated in iomap_dio_rw(). This prevents racing 190 + * buffered reads from zeroing out too much from page cache pages. 191 + * Note that all extending writes always happens synchronously with 192 + * inode lock held by ext2_dio_write_iter(). So it is safe to update 193 + * inode size here for extending file writes. 
194 + */ 195 + pos += size; 196 + if (pos > i_size_read(inode)) { 197 + i_size_write(inode, pos); 198 + mark_inode_dirty(inode); 199 + } 200 + out: 201 + return error; 202 + } 203 + 204 + static const struct iomap_dio_ops ext2_dio_write_ops = { 205 + .end_io = ext2_dio_write_end_io, 206 + }; 207 + 208 + static ssize_t ext2_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) 209 + { 210 + struct file *file = iocb->ki_filp; 211 + struct inode *inode = file->f_mapping->host; 212 + ssize_t ret; 213 + unsigned int flags = 0; 214 + unsigned long blocksize = inode->i_sb->s_blocksize; 215 + loff_t offset = iocb->ki_pos; 216 + loff_t count = iov_iter_count(from); 217 + 218 + inode_lock(inode); 219 + ret = generic_write_checks(iocb, from); 220 + if (ret <= 0) 221 + goto out_unlock; 222 + 223 + ret = kiocb_modified(iocb); 224 + if (ret) 225 + goto out_unlock; 226 + 227 + /* use IOMAP_DIO_FORCE_WAIT for unaligned or extending writes */ 228 + if (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode) || 229 + (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(from), blocksize))) 230 + flags |= IOMAP_DIO_FORCE_WAIT; 231 + 232 + ret = iomap_dio_rw(iocb, from, &ext2_iomap_ops, &ext2_dio_write_ops, 233 + flags, NULL, 0); 234 + 235 + /* ENOTBLK is magic return value for fallback to buffered-io */ 236 + if (ret == -ENOTBLK) 237 + ret = 0; 238 + 239 + if (ret < 0 && ret != -EIOCBQUEUED) 240 + ext2_write_failed(inode->i_mapping, offset + count); 241 + 242 + /* handle case for partial write and for fallback to buffered write */ 243 + if (ret >= 0 && iov_iter_count(from)) { 244 + loff_t pos, endbyte; 245 + ssize_t status; 246 + int ret2; 247 + 248 + iocb->ki_flags &= ~IOCB_DIRECT; 249 + pos = iocb->ki_pos; 250 + status = generic_perform_write(iocb, from); 251 + if (unlikely(status < 0)) { 252 + ret = status; 253 + goto out_unlock; 254 + } 255 + 256 + iocb->ki_pos += status; 257 + ret += status; 258 + endbyte = pos + status - 1; 259 + ret2 = 
filemap_write_and_wait_range(inode->i_mapping, pos, 260 + endbyte); 261 + if (!ret2) 262 + invalidate_mapping_pages(inode->i_mapping, 263 + pos >> PAGE_SHIFT, 264 + endbyte >> PAGE_SHIFT); 265 + if (ret > 0) 266 + generic_write_sync(iocb, ret); 267 + } 268 + 269 + out_unlock: 270 + inode_unlock(inode); 271 + return ret; 272 + } 273 + 165 274 static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 166 275 { 167 276 #ifdef CONFIG_FS_DAX 168 277 if (IS_DAX(iocb->ki_filp->f_mapping->host)) 169 278 return ext2_dax_read_iter(iocb, to); 170 279 #endif 280 + if (iocb->ki_flags & IOCB_DIRECT) 281 + return ext2_dio_read_iter(iocb, to); 282 + 171 283 return generic_file_read_iter(iocb, to); 172 284 } 173 285 ··· 289 177 if (IS_DAX(iocb->ki_filp->f_mapping->host)) 290 178 return ext2_dax_write_iter(iocb, from); 291 179 #endif 180 + if (iocb->ki_flags & IOCB_DIRECT) 181 + return ext2_dio_write_iter(iocb, from); 182 + 292 183 return generic_file_write_iter(iocb, from); 293 184 } 294 185
+34 -19
fs/ext2/inode.c
··· 56 56 57 57 static void ext2_truncate_blocks(struct inode *inode, loff_t offset); 58 58 59 - static void ext2_write_failed(struct address_space *mapping, loff_t to) 59 + void ext2_write_failed(struct address_space *mapping, loff_t to) 60 60 { 61 61 struct inode *inode = mapping->host; 62 62 ··· 809 809 bool new = false, boundary = false; 810 810 u32 bno; 811 811 int ret; 812 + bool create = flags & IOMAP_WRITE; 813 + 814 + /* 815 + * For writes that could fill holes inside i_size on a 816 + * DIO_SKIP_HOLES filesystem we forbid block creations: only 817 + * overwrites are permitted. 818 + */ 819 + if ((flags & IOMAP_DIRECT) && 820 + (first_block << blkbits) < i_size_read(inode)) 821 + create = 0; 822 + 823 + /* 824 + * Writes that span EOF might trigger an IO size update on completion, 825 + * so consider them to be dirty for the purposes of O_DSYNC even if 826 + * there is no other metadata changes pending or have been made here. 827 + */ 828 + if ((flags & IOMAP_WRITE) && offset + length > i_size_read(inode)) 829 + iomap->flags |= IOMAP_F_DIRTY; 812 830 813 831 ret = ext2_get_blocks(inode, first_block, max_blocks, 814 - &bno, &new, &boundary, flags & IOMAP_WRITE); 832 + &bno, &new, &boundary, create); 815 833 if (ret < 0) 816 834 return ret; 817 835 ··· 841 823 iomap->bdev = inode->i_sb->s_bdev; 842 824 843 825 if (ret == 0) { 826 + /* 827 + * Switch to buffered-io for writing to holes in a non-extent 828 + * based filesystem to avoid stale data exposure problem. 829 + */ 830 + if (!create && (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT)) 831 + return -ENOTBLK; 844 832 iomap->type = IOMAP_HOLE; 845 833 iomap->addr = IOMAP_NULL_ADDR; 846 834 iomap->length = 1 << blkbits; ··· 868 844 ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length, 869 845 ssize_t written, unsigned flags, struct iomap *iomap) 870 846 { 847 + /* 848 + * Switch to buffered-io in case of any error. 849 + * Blocks allocated can be used by the buffered-io path. 
850 + */ 851 + if ((flags & IOMAP_DIRECT) && (flags & IOMAP_WRITE) && written == 0) 852 + return -ENOTBLK; 853 + 871 854 if (iomap->type == IOMAP_MAPPED && 872 855 written < length && 873 856 (flags & IOMAP_WRITE)) ··· 939 908 return generic_block_bmap(mapping,block,ext2_get_block); 940 909 } 941 910 942 - static ssize_t 943 - ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) 944 - { 945 - struct file *file = iocb->ki_filp; 946 - struct address_space *mapping = file->f_mapping; 947 - struct inode *inode = mapping->host; 948 - size_t count = iov_iter_count(iter); 949 - loff_t offset = iocb->ki_pos; 950 - ssize_t ret; 951 - 952 - ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block); 953 - if (ret < 0 && iov_iter_rw(iter) == WRITE) 954 - ext2_write_failed(mapping, offset + count); 955 - return ret; 956 - } 957 - 958 911 static int 959 912 ext2_writepages(struct address_space *mapping, struct writeback_control *wbc) 960 913 { ··· 961 946 .write_begin = ext2_write_begin, 962 947 .write_end = ext2_write_end, 963 948 .bmap = ext2_bmap, 964 - .direct_IO = ext2_direct_IO, 949 + .direct_IO = noop_direct_IO, 965 950 .writepages = ext2_writepages, 966 951 .migrate_folio = buffer_migrate_folio, 967 952 .is_partially_uptodate = block_is_partially_uptodate,