Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bcachefs: Buffered write path now can avoid the inode lock

Non-append, non-extending buffered writes can now avoid taking the inode
lock.

To ensure atomicity of writes w.r.t. other writes, we lock every folio
that we'll be writing to, and if this fails we fall back to taking the
inode lock.

Extensive comments are provided as to corner cases.

Link: https://lore.kernel.org/linux-fsdevel/Zdkxfspq3urnrM6I@bombadil.infradead.org/
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

+112 -42
+2 -1
fs/bcachefs/errcode.h
··· 250 250 x(BCH_ERR_nopromote, nopromote_congested) \ 251 251 x(BCH_ERR_nopromote, nopromote_in_flight) \ 252 252 x(BCH_ERR_nopromote, nopromote_no_writes) \ 253 - x(BCH_ERR_nopromote, nopromote_enomem) 253 + x(BCH_ERR_nopromote, nopromote_enomem) \ 254 + x(0, need_inode_lock) 254 255 255 256 enum bch_errcode { 256 257 BCH_ERR_START = 2048,
+110 -41
fs/bcachefs/fs-io-buffered.c
··· 810 810 static int __bch2_buffered_write(struct bch_inode_info *inode, 811 811 struct address_space *mapping, 812 812 struct iov_iter *iter, 813 - loff_t pos, unsigned len) 813 + loff_t pos, unsigned len, 814 + bool inode_locked) 814 815 { 815 816 struct bch_fs *c = inode->v.i_sb->s_fs_info; 816 817 struct bch2_folio_reservation res; ··· 835 834 goto out; 836 835 837 836 BUG_ON(!fs.nr); 837 + 838 + /* 839 + * If we're not using the inode lock, we need to lock all the folios for 840 + * atomiticity of writes vs. other writes: 841 + */ 842 + if (!inode_locked && folio_end_pos(darray_last(fs)) < end) { 843 + ret = -BCH_ERR_need_inode_lock; 844 + goto out; 845 + } 838 846 839 847 f = darray_first(fs); 840 848 if (pos != folio_pos(f) && !folio_test_uptodate(f)) { ··· 939 929 end = pos + copied; 940 930 941 931 spin_lock(&inode->v.i_lock); 942 - if (end > inode->v.i_size) 932 + if (end > inode->v.i_size) { 933 + BUG_ON(!inode_locked); 943 934 i_size_write(&inode->v, end); 935 + } 944 936 spin_unlock(&inode->v.i_lock); 945 937 946 938 f_pos = pos; ··· 986 974 struct file *file = iocb->ki_filp; 987 975 struct address_space *mapping = file->f_mapping; 988 976 struct bch_inode_info *inode = file_bch_inode(file); 989 - loff_t pos = iocb->ki_pos; 990 - ssize_t written = 0; 991 - int ret = 0; 977 + loff_t pos; 978 + bool inode_locked = false; 979 + ssize_t written = 0, written2 = 0, ret = 0; 980 + 981 + /* 982 + * We don't take the inode lock unless i_size will be changing. Folio 983 + * locks provide exclusion with other writes, and the pagecache add lock 984 + * provides exclusion with truncate and hole punching. 
985 + * 986 + * There is one nasty corner case where atomicity would be broken 987 + * without great care: when copying data from userspace to the page 988 + * cache, we do that with faults disable - a page fault would recurse 989 + * back into the filesystem, taking filesystem locks again, and 990 + * deadlock; so it's done with faults disabled, and we fault in the user 991 + * buffer when we aren't holding locks. 992 + * 993 + * If we do part of the write, but we then race and in the userspace 994 + * buffer have been evicted and are no longer resident, then we have to 995 + * drop our folio locks to re-fault them in, breaking write atomicity. 996 + * 997 + * To fix this, we restart the write from the start, if we weren't 998 + * holding the inode lock. 999 + * 1000 + * There is another wrinkle after that; if we restart the write from the 1001 + * start, and then get an unrecoverable error, we _cannot_ claim to 1002 + * userspace that we did not write data we actually did - so we must 1003 + * track (written2) the most we ever wrote. 1004 + */ 1005 + 1006 + if ((iocb->ki_flags & IOCB_APPEND) || 1007 + (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) { 1008 + inode_lock(&inode->v); 1009 + inode_locked = true; 1010 + } 1011 + 1012 + ret = generic_write_checks(iocb, iter); 1013 + if (ret <= 0) 1014 + goto unlock; 1015 + 1016 + ret = file_remove_privs_flags(file, !inode_locked ? 
IOCB_NOWAIT : 0); 1017 + if (ret) { 1018 + if (!inode_locked) { 1019 + inode_lock(&inode->v); 1020 + inode_locked = true; 1021 + ret = file_remove_privs_flags(file, 0); 1022 + } 1023 + if (ret) 1024 + goto unlock; 1025 + } 1026 + 1027 + ret = file_update_time(file); 1028 + if (ret) 1029 + goto unlock; 1030 + 1031 + pos = iocb->ki_pos; 992 1032 993 1033 bch2_pagecache_add_get(inode); 1034 + 1035 + if (!inode_locked && 1036 + (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) 1037 + goto get_inode_lock; 994 1038 995 1039 do { 996 1040 unsigned offset = pos & (PAGE_SIZE - 1); ··· 1072 1004 } 1073 1005 } 1074 1006 1007 + if (unlikely(bytes != iov_iter_count(iter) && !inode_locked)) 1008 + goto get_inode_lock; 1009 + 1075 1010 if (unlikely(fatal_signal_pending(current))) { 1076 1011 ret = -EINTR; 1077 1012 break; 1078 1013 } 1079 1014 1080 - ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); 1015 + ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked); 1016 + if (ret == -BCH_ERR_need_inode_lock) 1017 + goto get_inode_lock; 1081 1018 if (unlikely(ret < 0)) 1082 1019 break; 1083 1020 ··· 1103 1030 } 1104 1031 pos += ret; 1105 1032 written += ret; 1033 + written2 = max(written, written2); 1034 + 1035 + if (ret != bytes && !inode_locked) 1036 + goto get_inode_lock; 1106 1037 ret = 0; 1107 1038 1108 1039 balance_dirty_pages_ratelimited(mapping); 1040 + 1041 + if (0) { 1042 + get_inode_lock: 1043 + bch2_pagecache_add_put(inode); 1044 + inode_lock(&inode->v); 1045 + inode_locked = true; 1046 + bch2_pagecache_add_get(inode); 1047 + 1048 + iov_iter_revert(iter, written); 1049 + pos -= written; 1050 + written = 0; 1051 + ret = 0; 1052 + } 1109 1053 } while (iov_iter_count(iter)); 1110 - 1111 1054 bch2_pagecache_add_put(inode); 1112 - 1113 - return written ? 
written : ret; 1114 - } 1115 - 1116 - ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) 1117 - { 1118 - struct file *file = iocb->ki_filp; 1119 - struct bch_inode_info *inode = file_bch_inode(file); 1120 - ssize_t ret; 1121 - 1122 - if (iocb->ki_flags & IOCB_DIRECT) { 1123 - ret = bch2_direct_write(iocb, from); 1124 - goto out; 1125 - } 1126 - 1127 - inode_lock(&inode->v); 1128 - 1129 - ret = generic_write_checks(iocb, from); 1130 - if (ret <= 0) 1131 - goto unlock; 1132 - 1133 - ret = file_remove_privs(file); 1134 - if (ret) 1135 - goto unlock; 1136 - 1137 - ret = file_update_time(file); 1138 - if (ret) 1139 - goto unlock; 1140 - 1141 - ret = bch2_buffered_write(iocb, from); 1142 - if (likely(ret > 0)) 1143 - iocb->ki_pos += ret; 1144 1055 unlock: 1145 - inode_unlock(&inode->v); 1056 + if (inode_locked) 1057 + inode_unlock(&inode->v); 1146 1058 1059 + iocb->ki_pos += written; 1060 + 1061 + ret = max(written, written2) ?: ret; 1147 1062 if (ret > 0) 1148 1063 ret = generic_write_sync(iocb, ret); 1149 - out: 1064 + return ret; 1065 + } 1066 + 1067 + ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter) 1068 + { 1069 + ssize_t ret = iocb->ki_flags & IOCB_DIRECT 1070 + ? bch2_direct_write(iocb, iter) 1071 + : bch2_buffered_write(iocb, iter); 1072 + 1150 1073 return bch2_err_class(ret); 1151 1074 } 1152 1075