Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ceph: add buffered/direct exclusionary locking for reads and writes

xfstest generic/451 intermittently fails. The test does O_DIRECT writes
to a file, and then reads back the result using buffered I/O, while
running a separate set of tasks that are also doing buffered reads.

The client will invalidate the cache prior to a direct write, but it's
easy for one of the other readers' replies to race in and reinstantiate
the invalidated range with stale data.

To fix this, we must serialize direct I/O writes and buffered reads.
We could just sprinkle in some shared locks on the i_rwsem for reads,
and increase the exclusive footprint on the write side, but that would
cause O_DIRECT writes to end up serialized vs. other direct requests.

Instead, borrow the scheme used by nfs.ko. Buffered writes take the
i_rwsem exclusively, but buffered reads take a shared lock, allowing
them to run in parallel.

O_DIRECT requests also take a shared lock, but we need them not to
run in parallel with buffered reads. A flag on the ceph_inode_info is
used to indicate whether it's in direct or buffered I/O mode. When a
conflicting request is submitted, it will block until the inode can be
flipped to the necessary mode.

Link: https://tracker.ceph.com/issues/40985
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>

authored by

Jeff Layton and committed by
Ilya Dryomov
321fe13c 4766815b

+200 -16
+1 -1
fs/ceph/Makefile
··· 6 6 obj-$(CONFIG_CEPH_FS) += ceph.o 7 7 8 8 ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ 9 - export.o caps.o snap.o xattr.o quota.o \ 9 + export.o caps.o snap.o xattr.o quota.o io.o \ 10 10 mds_client.o mdsmap.o strings.o ceph_frag.o \ 11 11 debugfs.o 12 12
+22 -13
fs/ceph/file.c
··· 15 15 #include "super.h" 16 16 #include "mds_client.h" 17 17 #include "cache.h" 18 + #include "io.h" 18 19 19 20 static __le32 ceph_flags_sys2wire(u32 flags) 20 21 { ··· 931 930 struct ceph_aio_request *aio_req = NULL; 932 931 int num_pages = 0; 933 932 int flags; 934 - int ret; 933 + int ret = 0; 935 934 struct timespec64 mtime = current_time(inode); 936 935 size_t count = iov_iter_count(iter); 937 936 loff_t pos = iocb->ki_pos; ··· 944 943 dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n", 945 944 (write ? "write" : "read"), file, pos, (unsigned)count, 946 945 snapc, snapc ? snapc->seq : 0); 947 - 948 - ret = filemap_write_and_wait_range(inode->i_mapping, 949 - pos, pos + count - 1); 950 - if (ret < 0) 951 - return ret; 952 946 953 947 if (write) { 954 948 int ret2 = invalidate_inode_pages2_range(inode->i_mapping, ··· 1280 1284 1281 1285 if (ci->i_inline_version == CEPH_INLINE_NONE) { 1282 1286 if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { 1287 + ceph_start_io_direct(inode); 1283 1288 ret = ceph_direct_read_write(iocb, to, 1284 1289 NULL, NULL); 1290 + ceph_end_io_direct(inode); 1285 1291 if (ret >= 0 && ret < len) 1286 1292 retry_op = CHECK_EOF; 1287 1293 } else { 1294 + ceph_start_io_read(inode); 1288 1295 ret = ceph_sync_read(iocb, to, &retry_op); 1296 + ceph_end_io_read(inode); 1289 1297 } 1290 1298 } else { 1291 1299 retry_op = READ_INLINE; ··· 1300 1300 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, 1301 1301 ceph_cap_string(got)); 1302 1302 ceph_add_rw_context(fi, &rw_ctx); 1303 + ceph_start_io_read(inode); 1303 1304 ret = generic_file_read_iter(iocb, to); 1305 + ceph_end_io_read(inode); 1304 1306 ceph_del_rw_context(fi, &rw_ctx); 1305 1307 } 1306 1308 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", ··· 1411 1409 return -ENOMEM; 1412 1410 1413 1411 retry_snap: 1414 - inode_lock(inode); 1412 + if (iocb->ki_flags & IOCB_DIRECT) 1413 + ceph_start_io_direct(inode); 1414 + else 1415 + ceph_start_io_write(inode); 1415 
1416 1416 1417 /* We can write back this queue in page reclaim */ 1417 1418 current->backing_dev_info = inode_to_bdi(inode); ··· 1485 1480 (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) { 1486 1481 struct ceph_snap_context *snapc; 1487 1482 struct iov_iter data; 1488 - inode_unlock(inode); 1489 1483 1490 1484 spin_lock(&ci->i_ceph_lock); 1491 1485 if (__ceph_have_pending_cap_snap(ci)) { ··· 1501 1497 1502 1498 /* we might need to revert back to that point */ 1503 1499 data = *from; 1504 - if (iocb->ki_flags & IOCB_DIRECT) 1500 + if (iocb->ki_flags & IOCB_DIRECT) { 1505 1501 written = ceph_direct_read_write(iocb, &data, snapc, 1506 1502 &prealloc_cf); 1507 - else 1503 + ceph_end_io_direct(inode); 1504 + } else { 1508 1505 written = ceph_sync_write(iocb, &data, pos, snapc); 1506 + ceph_end_io_write(inode); 1507 + } 1509 1508 if (written > 0) 1510 1509 iov_iter_advance(from, written); 1511 1510 ceph_put_snap_context(snapc); ··· 1523 1516 written = generic_perform_write(file, from, pos); 1524 1517 if (likely(written >= 0)) 1525 1518 iocb->ki_pos = pos + written; 1526 - inode_unlock(inode); 1519 + ceph_end_io_write(inode); 1527 1520 } 1528 1521 1529 1522 if (written >= 0) { ··· 1558 1551 } 1559 1552 1560 1553 goto out_unlocked; 1561 - 1562 1554 out: 1563 - inode_unlock(inode); 1555 + if (iocb->ki_flags & IOCB_DIRECT) 1556 + ceph_end_io_direct(inode); 1557 + else 1558 + ceph_end_io_write(inode); 1564 1559 out_unlocked: 1565 1560 ceph_free_cap_flush(prealloc_cf); 1566 1561 current->backing_dev_info = NULL;
+163
fs/ceph/io.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (c) 2016 Trond Myklebust 4 + * Copyright (c) 2019 Jeff Layton 5 + * 6 + * I/O and data path helper functionality. 7 + * 8 + * Heavily borrowed from equivalent code in fs/nfs/io.c 9 + */ 10 + 11 + #include <linux/ceph/ceph_debug.h> 12 + 13 + #include <linux/types.h> 14 + #include <linux/kernel.h> 15 + #include <linux/rwsem.h> 16 + #include <linux/fs.h> 17 + 18 + #include "super.h" 19 + #include "io.h" 20 + 21 + /* Call with exclusively locked inode->i_rwsem */ 22 + static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode) 23 + { 24 + lockdep_assert_held_write(&inode->i_rwsem); 25 + 26 + if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) { 27 + spin_lock(&ci->i_ceph_lock); 28 + ci->i_ceph_flags &= ~CEPH_I_ODIRECT; 29 + spin_unlock(&ci->i_ceph_lock); 30 + inode_dio_wait(inode); 31 + } 32 + } 33 + 34 + /** 35 + * ceph_start_io_read - declare the file is being used for buffered reads 36 + * @inode: file inode 37 + * 38 + * Declare that a buffered read operation is about to start, and ensure 39 + * that we block all direct I/O. 40 + * On exit, the function ensures that the CEPH_I_ODIRECT flag is unset, 41 + * and holds a shared lock on inode->i_rwsem to ensure that the flag 42 + * cannot be changed. 43 + * In practice, this means that buffered read operations are allowed to 44 + * execute in parallel, thanks to the shared lock, whereas direct I/O 45 + * operations need to wait to grab an exclusive lock in order to set 46 + * CEPH_I_ODIRECT. 47 + * Note that buffered writes and truncates both take a write lock on 48 + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. 49 + */ 50 + void 51 + ceph_start_io_read(struct inode *inode) 52 + { 53 + struct ceph_inode_info *ci = ceph_inode(inode); 54 + 55 + /* Be an optimist! 
*/ 56 + down_read(&inode->i_rwsem); 57 + if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) 58 + return; 59 + up_read(&inode->i_rwsem); 60 + /* Slow path.... */ 61 + down_write(&inode->i_rwsem); 62 + ceph_block_o_direct(ci, inode); 63 + downgrade_write(&inode->i_rwsem); 64 + } 65 + 66 + /** 67 + * ceph_end_io_read - declare that the buffered read operation is done 68 + * @inode: file inode 69 + * 70 + * Declare that a buffered read operation is done, and release the shared 71 + * lock on inode->i_rwsem. 72 + */ 73 + void 74 + ceph_end_io_read(struct inode *inode) 75 + { 76 + up_read(&inode->i_rwsem); 77 + } 78 + 79 + /** 80 + * ceph_start_io_write - declare the file is being used for buffered writes 81 + * @inode: file inode 82 + * 83 + * Declare that a buffered write operation is about to start, and ensure 84 + * that we block all direct I/O. 85 + */ 86 + void 87 + ceph_start_io_write(struct inode *inode) 88 + { 89 + down_write(&inode->i_rwsem); 90 + ceph_block_o_direct(ceph_inode(inode), inode); 91 + } 92 + 93 + /** 94 + * ceph_end_io_write - declare that the buffered write operation is done 95 + * @inode: file inode 96 + * 97 + * Declare that a buffered write operation is done, and release the 98 + * lock on inode->i_rwsem. 99 + */ 100 + void 101 + ceph_end_io_write(struct inode *inode) 102 + { 103 + up_write(&inode->i_rwsem); 104 + } 105 + 106 + /* Call with exclusively locked inode->i_rwsem */ 107 + static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode) 108 + { 109 + lockdep_assert_held_write(&inode->i_rwsem); 110 + 111 + if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) { 112 + spin_lock(&ci->i_ceph_lock); 113 + ci->i_ceph_flags |= CEPH_I_ODIRECT; 114 + spin_unlock(&ci->i_ceph_lock); 115 + /* FIXME: unmap_mapping_range? 
*/ 116 + filemap_write_and_wait(inode->i_mapping); 117 + } 118 + } 119 + 120 + /** 121 + * ceph_start_io_direct - declare the file is being used for direct i/o 122 + * @inode: file inode 123 + * 124 + * Declare that a direct I/O operation is about to start, and ensure 125 + * that we block all buffered I/O. 126 + * On exit, the function ensures that the CEPH_I_ODIRECT flag is set, 127 + * and holds a shared lock on inode->i_rwsem to ensure that the flag 128 + * cannot be changed. 129 + * In practice, this means that direct I/O operations are allowed to 130 + * execute in parallel, thanks to the shared lock, whereas buffered I/O 131 + * operations need to wait to grab an exclusive lock in order to clear 132 + * CEPH_I_ODIRECT. 133 + * Note that buffered writes and truncates both take a write lock on 134 + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. 135 + */ 136 + void 137 + ceph_start_io_direct(struct inode *inode) 138 + { 139 + struct ceph_inode_info *ci = ceph_inode(inode); 140 + 141 + /* Be an optimist! */ 142 + down_read(&inode->i_rwsem); 143 + if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) 144 + return; 145 + up_read(&inode->i_rwsem); 146 + /* Slow path.... */ 147 + down_write(&inode->i_rwsem); 148 + ceph_block_buffered(ci, inode); 149 + downgrade_write(&inode->i_rwsem); 150 + } 151 + 152 + /** 153 + * ceph_end_io_direct - declare that the direct i/o operation is done 154 + * @inode: file inode 155 + * 156 + * Declare that a direct I/O operation is done, and release the shared 157 + * lock on inode->i_rwsem. 158 + */ 159 + void 160 + ceph_end_io_direct(struct inode *inode) 161 + { 162 + up_read(&inode->i_rwsem); 163 + }
+12
fs/ceph/io.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _FS_CEPH_IO_H 3 + #define _FS_CEPH_IO_H 4 + 5 + void ceph_start_io_read(struct inode *inode); 6 + void ceph_end_io_read(struct inode *inode); 7 + void ceph_start_io_write(struct inode *inode); 8 + void ceph_end_io_write(struct inode *inode); 9 + void ceph_start_io_direct(struct inode *inode); 10 + void ceph_end_io_direct(struct inode *inode); 11 + 12 + #endif /* _FS_CEPH_IO_H */
+2 -2
fs/ceph/super.h
··· 513 513 #define CEPH_I_SEC_INITED (1 << 6) /* security initialized */ 514 514 #define CEPH_I_CAP_DROPPED (1 << 7) /* caps were forcibly dropped */ 515 515 #define CEPH_I_KICK_FLUSH (1 << 8) /* kick flushing caps */ 516 - #define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snapss */ 516 + #define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snapss */ 517 517 #define CEPH_I_ERROR_WRITE (1 << 10) /* have seen write errors */ 518 518 #define CEPH_I_ERROR_FILELOCK (1 << 11) /* have seen file lock errors */ 519 - 519 + #define CEPH_I_ODIRECT (1 << 12) /* inode in direct I/O mode */ 520 520 521 521 /* 522 522 * Masks of ceph inode work.