Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'statx-dioalign-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux

Pull STATX_DIOALIGN support from Eric Biggers:
"Make statx() support reporting direct I/O (DIO) alignment information.

This provides a generic interface for userspace programs to determine
whether a file supports DIO, and if so with what alignment
restrictions. Specifically, STATX_DIOALIGN works on block devices, and
on regular files when their containing filesystem has implemented
support.

An interface like this has been requested for years, since the
conditions for when DIO is supported in Linux have gotten increasingly
complex over time. Today, DIO support and alignment requirements can
be affected by various filesystem features such as multi-device
support, data journalling, inline data, encryption, verity,
compression, checkpoint disabling, log-structured mode, etc.

Further complicating things, Linux v6.0 relaxed the traditional rule
of DIO needing to be aligned to the block device's logical block size;
now user buffers (but not file offsets) only need to be aligned to the
DMA alignment.

The approach of uplifting the XFS-specific ioctl XFS_IOC_DIOINFO was
discarded in favor of creating a clean new interface with statx().

For more information, see the individual commits and the man page
update[1]"

Link: https://lore.kernel.org/r/20220722074229.148925-1-ebiggers@kernel.org [1]

* tag 'statx-dioalign-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux:
xfs: support STATX_DIOALIGN
f2fs: support STATX_DIOALIGN
f2fs: simplify f2fs_force_buffered_io()
f2fs: move f2fs_force_buffered_io() into file.c
ext4: support STATX_DIOALIGN
fscrypt: change fscrypt_dio_supported() to prepare for STATX_DIOALIGN
vfs: support STATX_DIOALIGN on block devices
statx: add direct I/O alignment information

+188 -83
+23
block/bdev.c
··· 26 26 #include <linux/namei.h> 27 27 #include <linux/part_stat.h> 28 28 #include <linux/uaccess.h> 29 + #include <linux/stat.h> 29 30 #include "../fs/internal.h" 30 31 #include "blk.h" 31 32 ··· 1069 1068 } 1070 1069 spin_unlock(&blockdev_superblock->s_inode_list_lock); 1071 1070 iput(old_inode); 1071 + } 1072 + 1073 + /* 1074 + * Handle STATX_DIOALIGN for block devices. 1075 + * 1076 + * Note that the inode passed to this is the inode of a block device node file, 1077 + * not the block device's internal inode. Therefore it is *not* valid to use 1078 + * I_BDEV() here; the block device has to be looked up by i_rdev instead. 1079 + */ 1080 + void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) 1081 + { 1082 + struct block_device *bdev; 1083 + 1084 + bdev = blkdev_get_no_open(inode->i_rdev); 1085 + if (!bdev) 1086 + return; 1087 + 1088 + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; 1089 + stat->dio_offset_align = bdev_logical_block_size(bdev); 1090 + stat->result_mask |= STATX_DIOALIGN; 1091 + 1092 + blkdev_put_no_open(bdev); 1072 1093 }
+24 -25
fs/crypto/inline_crypt.c
··· 396 396 EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio_bh); 397 397 398 398 /** 399 - * fscrypt_dio_supported() - check whether a DIO (direct I/O) request is 400 - * supported as far as encryption is concerned 401 - * @iocb: the file and position the I/O is targeting 402 - * @iter: the I/O data segment(s) 399 + * fscrypt_dio_supported() - check whether DIO (direct I/O) is supported on an 400 + * inode, as far as encryption is concerned 401 + * @inode: the inode in question 403 402 * 404 403 * Return: %true if there are no encryption constraints that prevent DIO from 405 404 * being supported; %false if DIO is unsupported. (Note that in the 406 405 * %true case, the filesystem might have other, non-encryption-related 407 - * constraints that prevent DIO from actually being supported.) 406 + * constraints that prevent DIO from actually being supported. Also, on 407 + * encrypted files the filesystem is still responsible for only allowing 408 + * DIO when requests are filesystem-block-aligned.) 408 409 */ 409 - bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter) 410 + bool fscrypt_dio_supported(struct inode *inode) 410 411 { 411 - const struct inode *inode = file_inode(iocb->ki_filp); 412 - const unsigned int blocksize = i_blocksize(inode); 412 + int err; 413 413 414 414 /* If the file is unencrypted, no veto from us. */ 415 415 if (!fscrypt_needs_contents_encryption(inode)) 416 416 return true; 417 417 418 - /* We only support DIO with inline crypto, not fs-layer crypto. */ 419 - if (!fscrypt_inode_uses_inline_crypto(inode)) 420 - return false; 421 - 422 418 /* 423 - * Since the granularity of encryption is filesystem blocks, the file 424 - * position and total I/O length must be aligned to the filesystem block 425 - * size -- not just to the block device's logical block size as is 426 - * traditionally the case for DIO on many filesystems. 419 + * We only support DIO with inline crypto, not fs-layer crypto. 
427 420 * 428 - * We require that the user-provided memory buffers be filesystem block 429 - * aligned too. It is simpler to have a single alignment value required 430 - * for all properties of the I/O, as is normally the case for DIO. 431 - * Also, allowing less aligned buffers would imply that data units could 432 - * cross bvecs, which would greatly complicate the I/O stack, which 433 - * assumes that bios can be split at any bvec boundary. 421 + * To determine whether the inode is using inline crypto, we have to set 422 + * up the key if it wasn't already done. This is because in the current 423 + * design of fscrypt, the decision of whether to use inline crypto or 424 + * not isn't made until the inode's encryption key is being set up. In 425 + * the DIO read/write case, the key will always be set up already, since 426 + * the file will be open. But in the case of statx(), the key might not 427 + * be set up yet, as the file might not have been opened yet. 434 428 */ 435 - if (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), blocksize)) 429 + err = fscrypt_require_key(inode); 430 + if (err) { 431 + /* 432 + * Key unavailable or couldn't be set up. This edge case isn't 433 + * worth worrying about; just report that DIO is unsupported. 434 + */ 436 435 return false; 437 - 438 - return true; 436 + } 437 + return fscrypt_inode_uses_inline_crypto(inode); 439 438 } 440 439 EXPORT_SYMBOL_GPL(fscrypt_dio_supported); 441 440
+1
fs/ext4/ext4.h
··· 2977 2977 extern int ext4_write_inode(struct inode *, struct writeback_control *); 2978 2978 extern int ext4_setattr(struct user_namespace *, struct dentry *, 2979 2979 struct iattr *); 2980 + extern u32 ext4_dio_alignment(struct inode *inode); 2980 2981 extern int ext4_getattr(struct user_namespace *, const struct path *, 2981 2982 struct kstat *, u32, unsigned int); 2982 2983 extern void ext4_evict_inode(struct inode *);
+26 -11
fs/ext4/file.c
··· 36 36 #include "acl.h" 37 37 #include "truncate.h" 38 38 39 - static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter) 39 + /* 40 + * Returns %true if the given DIO request should be attempted with DIO, or 41 + * %false if it should fall back to buffered I/O. 42 + * 43 + * DIO isn't well specified; when it's unsupported (either due to the request 44 + * being misaligned, or due to the file not supporting DIO at all), filesystems 45 + * either fall back to buffered I/O or return EINVAL. For files that don't use 46 + * any special features like encryption or verity, ext4 has traditionally 47 + * returned EINVAL for misaligned DIO. iomap_dio_rw() uses this convention too. 48 + * In this case, we should attempt the DIO, *not* fall back to buffered I/O. 49 + * 50 + * In contrast, in cases where DIO is unsupported due to ext4 features, ext4 51 + * traditionally falls back to buffered I/O. 52 + * 53 + * This function implements the traditional ext4 behavior in all these cases. 
54 + */ 55 + static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter) 40 56 { 41 57 struct inode *inode = file_inode(iocb->ki_filp); 58 + u32 dio_align = ext4_dio_alignment(inode); 42 59 43 - if (!fscrypt_dio_supported(iocb, iter)) 60 + if (dio_align == 0) 44 61 return false; 45 - if (fsverity_active(inode)) 46 - return false; 47 - if (ext4_should_journal_data(inode)) 48 - return false; 49 - if (ext4_has_inline_data(inode)) 50 - return false; 51 - return true; 62 + 63 + if (dio_align == 1) 64 + return true; 65 + 66 + return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align); 52 67 } 53 68 54 69 static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) ··· 78 63 inode_lock_shared(inode); 79 64 } 80 65 81 - if (!ext4_dio_supported(iocb, to)) { 66 + if (!ext4_should_use_dio(iocb, to)) { 82 67 inode_unlock_shared(inode); 83 68 /* 84 69 * Fallback to buffered I/O if the operation being performed on ··· 526 511 } 527 512 528 513 /* Fallback to buffered I/O if the inode does not support direct I/O. */ 529 - if (!ext4_dio_supported(iocb, from)) { 514 + if (!ext4_should_use_dio(iocb, from)) { 530 515 if (ilock_shared) 531 516 inode_unlock_shared(inode); 532 517 else
+37
fs/ext4/inode.c
··· 5550 5550 return error; 5551 5551 } 5552 5552 5553 + u32 ext4_dio_alignment(struct inode *inode) 5554 + { 5555 + if (fsverity_active(inode)) 5556 + return 0; 5557 + if (ext4_should_journal_data(inode)) 5558 + return 0; 5559 + if (ext4_has_inline_data(inode)) 5560 + return 0; 5561 + if (IS_ENCRYPTED(inode)) { 5562 + if (!fscrypt_dio_supported(inode)) 5563 + return 0; 5564 + return i_blocksize(inode); 5565 + } 5566 + return 1; /* use the iomap defaults */ 5567 + } 5568 + 5553 5569 int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path, 5554 5570 struct kstat *stat, u32 request_mask, unsigned int query_flags) 5555 5571 { ··· 5579 5563 stat->result_mask |= STATX_BTIME; 5580 5564 stat->btime.tv_sec = ei->i_crtime.tv_sec; 5581 5565 stat->btime.tv_nsec = ei->i_crtime.tv_nsec; 5566 + } 5567 + 5568 + /* 5569 + * Return the DIO alignment restrictions if requested. We only return 5570 + * this information when requested, since on encrypted files it might 5571 + * take a fair bit of work to get if the file wasn't opened recently. 5572 + */ 5573 + if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) { 5574 + u32 dio_align = ext4_dio_alignment(inode); 5575 + 5576 + stat->result_mask |= STATX_DIOALIGN; 5577 + if (dio_align == 1) { 5578 + struct block_device *bdev = inode->i_sb->s_bdev; 5579 + 5580 + /* iomap defaults */ 5581 + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; 5582 + stat->dio_offset_align = bdev_logical_block_size(bdev); 5583 + } else { 5584 + stat->dio_mem_align = dio_align; 5585 + stat->dio_offset_align = dio_align; 5586 + } 5582 5587 } 5583 5588 5584 5589 flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
-40
fs/f2fs/f2fs.h
··· 4471 4471 f2fs_mark_inode_dirty_sync(inode, true); 4472 4472 } 4473 4473 4474 - static inline int block_unaligned_IO(struct inode *inode, 4475 - struct kiocb *iocb, struct iov_iter *iter) 4476 - { 4477 - unsigned int i_blkbits = READ_ONCE(inode->i_blkbits); 4478 - unsigned int blocksize_mask = (1 << i_blkbits) - 1; 4479 - loff_t offset = iocb->ki_pos; 4480 - unsigned long align = offset | iov_iter_alignment(iter); 4481 - 4482 - return align & blocksize_mask; 4483 - } 4484 - 4485 4474 static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi, 4486 4475 int flag) 4487 4476 { ··· 4479 4490 if (flag != F2FS_GET_BLOCK_DIO) 4480 4491 return false; 4481 4492 return sbi->aligned_blksize; 4482 - } 4483 - 4484 - static inline bool f2fs_force_buffered_io(struct inode *inode, 4485 - struct kiocb *iocb, struct iov_iter *iter) 4486 - { 4487 - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); 4488 - int rw = iov_iter_rw(iter); 4489 - 4490 - if (!fscrypt_dio_supported(iocb, iter)) 4491 - return true; 4492 - if (fsverity_active(inode)) 4493 - return true; 4494 - if (f2fs_compressed_file(inode)) 4495 - return true; 4496 - 4497 - /* disallow direct IO if any of devices has unaligned blksize */ 4498 - if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize) 4499 - return true; 4500 - 4501 - if (f2fs_lfs_mode(sbi) && (rw == WRITE)) { 4502 - if (block_unaligned_IO(inode, iocb, iter)) 4503 - return true; 4504 - if (F2FS_IO_ALIGNED(sbi)) 4505 - return true; 4506 - } 4507 - if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED)) 4508 - return true; 4509 - 4510 - return false; 4511 4493 } 4512 4494 4513 4495 static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
+42 -1
fs/f2fs/file.c
··· 808 808 return 0; 809 809 } 810 810 811 + static bool f2fs_force_buffered_io(struct inode *inode, int rw) 812 + { 813 + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); 814 + 815 + if (!fscrypt_dio_supported(inode)) 816 + return true; 817 + if (fsverity_active(inode)) 818 + return true; 819 + if (f2fs_compressed_file(inode)) 820 + return true; 821 + 822 + /* disallow direct IO if any of devices has unaligned blksize */ 823 + if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize) 824 + return true; 825 + 826 + if (f2fs_lfs_mode(sbi) && rw == WRITE && F2FS_IO_ALIGNED(sbi)) 827 + return true; 828 + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED)) 829 + return true; 830 + 831 + return false; 832 + } 833 + 811 834 int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, 812 835 struct kstat *stat, u32 request_mask, unsigned int query_flags) 813 836 { ··· 845 822 stat->result_mask |= STATX_BTIME; 846 823 stat->btime.tv_sec = fi->i_crtime.tv_sec; 847 824 stat->btime.tv_nsec = fi->i_crtime.tv_nsec; 825 + } 826 + 827 + /* 828 + * Return the DIO alignment restrictions if requested. We only return 829 + * this information when requested, since on encrypted files it might 830 + * take a fair bit of work to get if the file wasn't opened recently. 831 + * 832 + * f2fs sometimes supports DIO reads but not DIO writes. STATX_DIOALIGN 833 + * cannot represent that, so in that case we report no DIO support. 
834 + */ 835 + if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) { 836 + unsigned int bsize = i_blocksize(inode); 837 + 838 + stat->result_mask |= STATX_DIOALIGN; 839 + if (!f2fs_force_buffered_io(inode, WRITE)) { 840 + stat->dio_mem_align = bsize; 841 + stat->dio_offset_align = bsize; 842 + } 848 843 } 849 844 850 845 flags = fi->i_flags; ··· 4223 4182 if (!(iocb->ki_flags & IOCB_DIRECT)) 4224 4183 return false; 4225 4184 4226 - if (f2fs_force_buffered_io(inode, iocb, iter)) 4185 + if (f2fs_force_buffered_io(inode, iov_iter_rw(iter))) 4227 4186 return false; 4228 4187 4229 4188 /*
+14
fs/stat.c
··· 5 5 * Copyright (C) 1991, 1992 Linus Torvalds 6 6 */ 7 7 8 + #include <linux/blkdev.h> 8 9 #include <linux/export.h> 9 10 #include <linux/mm.h> 10 11 #include <linux/errno.h> ··· 231 230 goto out; 232 231 233 232 error = vfs_getattr(&path, stat, request_mask, flags); 233 + 234 234 stat->mnt_id = real_mount(path.mnt)->mnt_id; 235 235 stat->result_mask |= STATX_MNT_ID; 236 + 236 237 if (path.mnt->mnt_root == path.dentry) 237 238 stat->attributes |= STATX_ATTR_MOUNT_ROOT; 238 239 stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; 240 + 241 + /* Handle STATX_DIOALIGN for block devices. */ 242 + if (request_mask & STATX_DIOALIGN) { 243 + struct inode *inode = d_backing_inode(path.dentry); 244 + 245 + if (S_ISBLK(inode->i_mode)) 246 + bdev_statx_dioalign(inode, stat); 247 + } 248 + 239 249 path_put(&path); 240 250 if (retry_estale(error, lookup_flags)) { 241 251 lookup_flags |= LOOKUP_REVAL; ··· 623 611 tmp.stx_dev_major = MAJOR(stat->dev); 624 612 tmp.stx_dev_minor = MINOR(stat->dev); 625 613 tmp.stx_mnt_id = stat->mnt_id; 614 + tmp.stx_dio_mem_align = stat->dio_mem_align; 615 + tmp.stx_dio_offset_align = stat->dio_offset_align; 626 616 627 617 return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0; 628 618 }
+10
fs/xfs/xfs_iops.c
··· 604 604 stat->blksize = BLKDEV_IOSIZE; 605 605 stat->rdev = inode->i_rdev; 606 606 break; 607 + case S_IFREG: 608 + if (request_mask & STATX_DIOALIGN) { 609 + struct xfs_buftarg *target = xfs_inode_buftarg(ip); 610 + struct block_device *bdev = target->bt_bdev; 611 + 612 + stat->result_mask |= STATX_DIOALIGN; 613 + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; 614 + stat->dio_offset_align = bdev_logical_block_size(bdev); 615 + } 616 + fallthrough; 607 617 default: 608 618 stat->blksize = xfs_stat_blksize(ip); 609 619 stat->rdev = 0;
+4
include/linux/blkdev.h
··· 1498 1498 int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend); 1499 1499 int sync_blockdev_nowait(struct block_device *bdev); 1500 1500 void sync_bdevs(bool wait); 1501 + void bdev_statx_dioalign(struct inode *inode, struct kstat *stat); 1501 1502 void printk_all_partitions(void); 1502 1503 #else 1503 1504 static inline void invalidate_bdev(struct block_device *bdev) ··· 1513 1512 return 0; 1514 1513 } 1515 1514 static inline void sync_bdevs(bool wait) 1515 + { 1516 + } 1517 + static inline void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) 1516 1518 { 1517 1519 } 1518 1520 static inline void printk_all_partitions(void)
+2 -5
include/linux/fscrypt.h
··· 764 764 bool fscrypt_mergeable_bio_bh(struct bio *bio, 765 765 const struct buffer_head *next_bh); 766 766 767 - bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter); 767 + bool fscrypt_dio_supported(struct inode *inode); 768 768 769 769 u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks); 770 770 ··· 797 797 return true; 798 798 } 799 799 800 - static inline bool fscrypt_dio_supported(struct kiocb *iocb, 801 - struct iov_iter *iter) 800 + static inline bool fscrypt_dio_supported(struct inode *inode) 802 801 { 803 - const struct inode *inode = file_inode(iocb->ki_filp); 804 - 805 802 return !fscrypt_needs_contents_encryption(inode); 806 803 } 807 804
+2
include/linux/stat.h
··· 50 50 struct timespec64 btime; /* File creation time */ 51 51 u64 blocks; 52 52 u64 mnt_id; 53 + u32 dio_mem_align; 54 + u32 dio_offset_align; 53 55 }; 54 56 55 57 #endif
+3 -1
include/uapi/linux/stat.h
··· 124 124 __u32 stx_dev_minor; 125 125 /* 0x90 */ 126 126 __u64 stx_mnt_id; 127 - __u64 __spare2; 127 + __u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */ 128 + __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */ 128 129 /* 0xa0 */ 129 130 __u64 __spare3[12]; /* Spare space for future expansion */ 130 131 /* 0x100 */ ··· 153 152 #define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ 154 153 #define STATX_BTIME 0x00000800U /* Want/got stx_btime */ 155 154 #define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */ 155 + #define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */ 156 156 157 157 #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ 158 158