Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bcachefs: btree write buffer now slurps keys from journal

Previously, the transaction commit path would have to add keys to the
btree write buffer as a separate operation, requiring additional global
synchronization.

This patch introduces a new journal entry type, which indicates that the
keys need to be copied into the btree write buffer prior to being
written out. We switch the journal entry type back to
JSET_ENTRY_btree_keys prior to write, so this is not an on disk format
change.

Flushing the btree write buffer may require pulling keys out of journal
entries yet to be written, and quiescing outstanding journal
reservations; we previously added journal->buf_lock for synchronization
with the journal write path.

We also can't put strict bounds on the number of keys in the journal
destined for the write buffer, which means we might overflow the size of
the preallocated buffer and have to reallocate - this introduces a
potentially fatal memory allocation failure. This is something we'll
have to watch for, if it becomes an issue in practice we can do
additional mitigation.

The transaction commit path no longer has to explicitly check if the
write buffer is full and wait on flushing; this is another performance
optimization. Instead, when the btree write buffer is close to full we
change the journal watermark, so that only reservations for journal
reclaim are allowed.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

+520 -232
+11
fs/bcachefs/bcachefs.h
··· 427 427 x(blocked_journal_max_in_flight) \ 428 428 x(blocked_allocate) \ 429 429 x(blocked_allocate_open_bucket) \ 430 + x(blocked_write_buffer_full) \ 430 431 x(nocow_lock_contended) 431 432 432 433 enum bch_time_stats { ··· 1120 1119 atomic_long_inc(&c->writes[ref]); 1121 1120 #else 1122 1121 percpu_ref_get(&c->writes); 1122 + #endif 1123 + } 1124 + 1125 + static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) 1126 + { 1127 + #ifdef BCH_WRITE_REF_DEBUG 1128 + return !test_bit(BCH_FS_going_ro, &c->flags) && 1129 + atomic_long_inc_not_zero(&c->writes[ref]); 1130 + #else 1131 + return percpu_ref_tryget(&c->writes); 1123 1132 #endif 1124 1133 } 1125 1134
+2 -1
fs/bcachefs/bcachefs_format.h
··· 2162 2162 x(clock, 7) \ 2163 2163 x(dev_usage, 8) \ 2164 2164 x(log, 9) \ 2165 - x(overwrite, 10) 2165 + x(overwrite, 10) \ 2166 + x(write_buffer_keys, 11) 2166 2167 2167 2168 enum { 2168 2169 #define x(f, nr) BCH_JSET_ENTRY_##f = nr,
+1 -51
fs/bcachefs/btree_trans_commit.c
··· 659 659 i->k->k.needs_whiteout = false; 660 660 } 661 661 662 - if (trans->nr_wb_updates && 663 - trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size) 664 - return -BCH_ERR_btree_insert_need_flush_buffer; 665 - 666 662 /* 667 663 * Don't get journal reservation until after we know insert will 668 664 * succeed: ··· 692 696 if (trans->fs_usage_deltas && 693 697 bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) 694 698 return -BCH_ERR_btree_insert_need_mark_replicas; 695 - 696 - if (trans->nr_wb_updates) { 697 - EBUG_ON(flags & BCH_TRANS_COMMIT_no_journal_res); 698 - 699 - ret = bch2_btree_insert_keys_write_buffer(trans); 700 - if (ret) 701 - goto revert_fs_usage; 702 - } 703 699 704 700 h = trans->hooks; 705 701 while (h) { ··· 745 757 746 758 trans_for_each_wb_update(trans, wb) { 747 759 entry = bch2_journal_add_entry(j, &trans->journal_res, 748 - BCH_JSET_ENTRY_btree_keys, 760 + BCH_JSET_ENTRY_write_buffer_keys, 749 761 wb->btree, 0, 750 762 wb->k.k.u64s); 751 763 bkey_copy((struct bkey_i *) entry->start, &wb->k); ··· 936 948 937 949 ret = bch2_trans_relock(trans); 938 950 break; 939 - case -BCH_ERR_btree_insert_need_flush_buffer: { 940 - struct btree_write_buffer *wb = &c->btree_write_buffer; 941 - 942 - ret = 0; 943 - 944 - if (wb->state.nr > wb->size * 3 / 4) { 945 - bch2_trans_unlock(trans); 946 - mutex_lock(&wb->flush_lock); 947 - 948 - if (wb->state.nr > wb->size * 3 / 4) { 949 - bch2_trans_begin(trans); 950 - ret = bch2_btree_write_buffer_flush_locked(trans); 951 - mutex_unlock(&wb->flush_lock); 952 - if (!ret) { 953 - trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); 954 - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); 955 - } 956 - } else { 957 - mutex_unlock(&wb->flush_lock); 958 - ret = bch2_trans_relock(trans); 959 - } 960 - } 961 - break; 962 - } 963 951 default: 964 952 BUG_ON(ret >= 0); 965 953 break; ··· 1032 1068 ret = 
bch2_trans_commit_get_rw_cold(trans, flags); 1033 1069 if (ret) 1034 1070 goto out_reset; 1035 - } 1036 - 1037 - if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 && 1038 - mutex_trylock(&c->btree_write_buffer.flush_lock)) { 1039 - bch2_trans_begin(trans); 1040 - bch2_trans_unlock(trans); 1041 - 1042 - ret = bch2_btree_write_buffer_flush_locked(trans); 1043 - mutex_unlock(&c->btree_write_buffer.flush_lock); 1044 - if (!ret) { 1045 - trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); 1046 - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); 1047 - } 1048 - goto out; 1049 1071 } 1050 1072 1051 1073 EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
+307 -135
fs/bcachefs/btree_write_buffer.c
··· 7 7 #include "btree_write_buffer.h" 8 8 #include "error.h" 9 9 #include "journal.h" 10 + #include "journal_io.h" 10 11 #include "journal_reclaim.h" 11 12 13 + #include <linux/prefetch.h> 12 14 #include <linux/sort.h> 13 15 14 16 static int bch2_btree_write_buffer_journal_flush(struct journal *, 15 17 struct journal_entry_pin *, u64); 16 18 17 - static int btree_write_buffered_key_cmp(const void *_l, const void *_r) 19 + static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *); 20 + 21 + static inline int wb_key_cmp(const void *_l, const void *_r) 18 22 { 19 - const struct btree_write_buffered_key *l = _l; 20 - const struct btree_write_buffered_key *r = _r; 23 + const struct wb_key_ref *l = _l; 24 + const struct wb_key_ref *r = _r; 21 25 22 - return cmp_int(l->btree, r->btree) ?: 23 - bpos_cmp(l->k.k.p, r->k.k.p) ?: 24 - cmp_int(l->journal_seq, r->journal_seq) ?: 25 - cmp_int(l->journal_offset, r->journal_offset); 26 - } 27 - 28 - static int btree_write_buffered_journal_cmp(const void *_l, const void *_r) 29 - { 30 - const struct btree_write_buffered_key *l = _l; 31 - const struct btree_write_buffered_key *r = _r; 32 - 33 - return cmp_int(l->journal_seq, r->journal_seq); 26 + return cmp_int(l->hi, r->hi) ?: 27 + cmp_int(l->mi, r->mi) ?: 28 + cmp_int(l->lo, r->lo); 34 29 } 35 30 36 31 static noinline int wb_flush_one_slowpath(struct btree_trans *trans, ··· 54 59 int ret; 55 60 56 61 EBUG_ON(!wb->journal_seq); 62 + EBUG_ON(!c->btree_write_buffer.flushing.pin.seq); 63 + EBUG_ON(c->btree_write_buffer.flushing.pin.seq > wb->journal_seq); 64 + 57 65 ret = bch2_btree_iter_traverse(iter); 58 66 if (ret) 59 67 return ret; ··· 89 91 return 0; 90 92 } 91 93 92 - static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb) 93 - { 94 - union btree_write_buffer_state old, new; 95 - u64 v = READ_ONCE(wb->state.v); 96 - 97 - do { 98 - old.v = new.v = v; 99 - 100 - new.nr = 0; 101 - new.idx++; 102 - } while ((v = 
atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v); 103 - 104 - while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1) 105 - cpu_relax(); 106 - 107 - smp_mb(); 108 - 109 - return old; 110 - } 111 - 112 94 /* 113 95 * Update a btree with a write buffered key using the journal seq of the 114 96 * original write buffer insert. ··· 118 140 return ret; 119 141 } 120 142 121 - int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) 143 + static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb) 144 + { 145 + struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer); 146 + struct journal *j = &c->journal; 147 + 148 + if (!wb->inc.keys.nr) 149 + return; 150 + 151 + bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin, 152 + bch2_btree_write_buffer_journal_flush); 153 + 154 + darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr)); 155 + darray_resize(&wb->sorted, wb->flushing.keys.size); 156 + 157 + if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) { 158 + swap(wb->flushing.keys, wb->inc.keys); 159 + goto out; 160 + } 161 + 162 + size_t nr = min(darray_room(wb->flushing.keys), 163 + wb->sorted.size - wb->flushing.keys.nr); 164 + nr = min(nr, wb->inc.keys.nr); 165 + 166 + memcpy(&darray_top(wb->flushing.keys), 167 + wb->inc.keys.data, 168 + sizeof(wb->inc.keys.data[0]) * nr); 169 + 170 + memmove(wb->inc.keys.data, 171 + wb->inc.keys.data + nr, 172 + sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr)); 173 + 174 + wb->flushing.keys.nr += nr; 175 + wb->inc.keys.nr -= nr; 176 + out: 177 + if (!wb->inc.keys.nr) 178 + bch2_journal_pin_drop(j, &wb->inc.pin); 179 + else 180 + bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin, 181 + bch2_btree_write_buffer_journal_flush); 182 + 183 + if (j->watermark) { 184 + spin_lock(&j->lock); 185 + bch2_journal_set_watermark(j); 186 + spin_unlock(&j->lock); 187 + } 188 + 189 + 
BUG_ON(wb->sorted.size < wb->flushing.keys.nr); 190 + } 191 + 192 + static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) 122 193 { 123 194 struct bch_fs *c = trans->c; 124 195 struct journal *j = &c->journal; 125 196 struct btree_write_buffer *wb = &c->btree_write_buffer; 126 - struct journal_entry_pin pin; 127 - struct btree_write_buffered_key *i, *keys; 197 + struct wb_key_ref *i; 128 198 struct btree_iter iter = { NULL }; 129 - size_t nr = 0, skipped = 0, fast = 0, slowpath = 0; 199 + size_t skipped = 0, fast = 0, slowpath = 0; 130 200 bool write_locked = false; 131 - union btree_write_buffer_state s; 132 201 int ret = 0; 133 202 134 - memset(&pin, 0, sizeof(pin)); 203 + bch2_trans_unlock(trans); 204 + bch2_trans_begin(trans); 135 205 136 - bch2_journal_pin_copy(j, &pin, &wb->journal_pin, 137 - bch2_btree_write_buffer_journal_flush); 138 - bch2_journal_pin_drop(j, &wb->journal_pin); 206 + mutex_lock(&wb->inc.lock); 207 + move_keys_from_inc_to_flushing(wb); 208 + mutex_unlock(&wb->inc.lock); 139 209 140 - s = btree_write_buffer_switch(wb); 141 - keys = wb->keys[s.idx]; 142 - nr = s.nr; 210 + for (size_t i = 0; i < wb->flushing.keys.nr; i++) { 211 + wb->sorted.data[i].idx = i; 212 + wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree; 213 + wb->sorted.data[i].pos = wb->flushing.keys.data[i].k.k.p; 214 + } 215 + wb->sorted.nr = wb->flushing.keys.nr; 143 216 144 217 /* 145 218 * We first sort so that we can detect and skip redundant updates, and ··· 206 177 * If that happens, simply skip the key so we can optimistically insert 207 178 * as many keys as possible in the fast path. 
208 179 */ 209 - sort(keys, nr, sizeof(keys[0]), 210 - btree_write_buffered_key_cmp, NULL); 180 + sort(wb->sorted.data, wb->sorted.nr, 181 + sizeof(wb->sorted.data[0]), 182 + wb_key_cmp, NULL); 211 183 212 - for (i = keys; i < keys + nr; i++) { 213 - if (i + 1 < keys + nr && 184 + darray_for_each(wb->sorted, i) { 185 + struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx]; 186 + 187 + for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++) 188 + prefetch(&wb->flushing.keys.data[n->idx]); 189 + 190 + BUG_ON(!k->journal_seq); 191 + 192 + if (i + 1 < &darray_top(wb->sorted) && 214 193 i[0].btree == i[1].btree && 215 - bpos_eq(i[0].k.k.p, i[1].k.k.p)) { 194 + bpos_eq(i[0].pos, i[1].pos)) { 195 + struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx]; 196 + 216 197 skipped++; 217 - i->journal_seq = 0; 198 + n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq); 199 + k->journal_seq = 0; 218 200 continue; 219 201 } 220 202 221 - if (write_locked && 222 - (iter.path->btree_id != i->btree || 223 - bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) { 224 - bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b); 225 - write_locked = false; 203 + if (write_locked) { 204 + struct btree_path *path = iter.path; 205 + 206 + if (path->btree_id != i->btree || 207 + bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) { 208 + bch2_btree_node_unlock_write(trans, path, path->l[0].b); 209 + write_locked = false; 210 + } 226 211 } 227 212 228 - if (!iter.path || iter.path->btree_id != i->btree) { 213 + if (!iter.path || iter.btree_id != k->btree) { 229 214 bch2_trans_iter_exit(trans, &iter); 230 - bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, 215 + bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p, 231 216 BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS); 232 217 } 233 218 234 - bch2_btree_iter_set_pos(&iter, i->k.k.p); 219 + bch2_btree_iter_set_pos(&iter, k->k.k.p); 235 220 iter.path->preserve = false; 236 221 237 222 
do { ··· 254 211 break; 255 212 } 256 213 257 - ret = wb_flush_one(trans, &iter, i, &write_locked, &fast); 214 + ret = wb_flush_one(trans, &iter, k, &write_locked, &fast); 258 215 if (!write_locked) 259 216 bch2_trans_begin(trans); 260 217 } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); 261 218 262 219 if (!ret) { 263 - i->journal_seq = 0; 220 + k->journal_seq = 0; 264 221 } else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) { 265 222 slowpath++; 266 223 ret = 0; ··· 282 239 * The fastpath zapped the seq of keys that were successfully flushed so 283 240 * we can skip those here. 284 241 */ 285 - trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, nr); 242 + trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr); 286 243 287 - sort(keys, nr, sizeof(keys[0]), 288 - btree_write_buffered_journal_cmp, 289 - NULL); 290 - 291 - for (i = keys; i < keys + nr; i++) { 244 + struct btree_write_buffered_key *i; 245 + darray_for_each(wb->flushing.keys, i) { 292 246 if (!i->journal_seq) 293 247 continue; 294 248 295 - bch2_journal_pin_update(j, i->journal_seq, &pin, 296 - bch2_btree_write_buffer_journal_flush); 249 + bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin, 250 + bch2_btree_write_buffer_journal_flush); 251 + 252 + bch2_trans_begin(trans); 297 253 298 254 ret = commit_do(trans, NULL, NULL, 299 255 BCH_WATERMARK_reclaim| ··· 307 265 } 308 266 err: 309 267 bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)); 310 - trace_write_buffer_flush(trans, nr, skipped, fast, wb->size); 311 - bch2_journal_pin_drop(j, &pin); 268 + trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0); 269 + bch2_journal_pin_drop(j, &wb->flushing.pin); 270 + wb->flushing.keys.nr = 0; 312 271 return ret; 272 + } 273 + 274 + static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq) 275 + { 276 + struct journal *j = &c->journal; 277 + struct journal_buf *buf; 278 + int ret = 0; 
279 + 280 + while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, seq))) { 281 + ret = bch2_journal_keys_to_write_buffer(c, buf); 282 + mutex_unlock(&j->buf_lock); 283 + } 284 + 285 + return ret; 286 + } 287 + 288 + static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq) 289 + { 290 + struct bch_fs *c = trans->c; 291 + struct btree_write_buffer *wb = &c->btree_write_buffer; 292 + int ret = 0, fetch_from_journal_err; 293 + 294 + do { 295 + bch2_trans_unlock(trans); 296 + 297 + fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq); 298 + 299 + /* 300 + * On memory allocation failure, bch2_btree_write_buffer_flush_locked() 301 + * is not guaranteed to empty wb->inc: 302 + */ 303 + mutex_lock(&wb->flushing.lock); 304 + ret = bch2_btree_write_buffer_flush_locked(trans); 305 + mutex_unlock(&wb->flushing.lock); 306 + } while (!ret && 307 + (fetch_from_journal_err || 308 + (wb->inc.pin.seq && wb->inc.pin.seq <= seq) || 309 + (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq))); 310 + 311 + return ret; 312 + } 313 + 314 + static int bch2_btree_write_buffer_journal_flush(struct journal *j, 315 + struct journal_entry_pin *_pin, u64 seq) 316 + { 317 + struct bch_fs *c = container_of(j, struct bch_fs, journal); 318 + 319 + return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq)); 313 320 } 314 321 315 322 int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) 316 323 { 317 324 struct bch_fs *c = trans->c; 318 325 319 - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer)) 320 - return -BCH_ERR_erofs_no_writes; 321 - 322 326 trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_); 323 327 324 - bch2_trans_unlock(trans); 325 - mutex_lock(&c->btree_write_buffer.flush_lock); 326 - int ret = bch2_btree_write_buffer_flush_locked(trans); 327 - mutex_unlock(&c->btree_write_buffer.flush_lock); 328 - bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); 329 - return ret; 328 + return 
btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal)); 330 329 } 331 330 332 331 int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans) ··· 376 293 struct btree_write_buffer *wb = &c->btree_write_buffer; 377 294 int ret = 0; 378 295 379 - if (mutex_trylock(&wb->flush_lock)) { 296 + if (mutex_trylock(&wb->flushing.lock)) { 380 297 ret = bch2_btree_write_buffer_flush_locked(trans); 381 - mutex_unlock(&wb->flush_lock); 298 + mutex_unlock(&wb->flushing.lock); 382 299 } 383 300 384 301 return ret; ··· 396 313 return ret; 397 314 } 398 315 399 - static int bch2_btree_write_buffer_journal_flush(struct journal *j, 400 - struct journal_entry_pin *_pin, u64 seq) 316 + static void bch2_btree_write_buffer_flush_work(struct work_struct *work) 401 317 { 402 - struct bch_fs *c = container_of(j, struct bch_fs, journal); 318 + struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work); 403 319 struct btree_write_buffer *wb = &c->btree_write_buffer; 320 + int ret; 404 321 405 - mutex_lock(&wb->flush_lock); 406 - int ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans)); 407 - mutex_unlock(&wb->flush_lock); 322 + mutex_lock(&wb->flushing.lock); 323 + do { 324 + ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans)); 325 + } while (!ret && bch2_btree_write_buffer_should_flush(c)); 326 + mutex_unlock(&wb->flushing.lock); 408 327 409 - return ret; 328 + bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); 410 329 } 411 330 412 - static inline u64 btree_write_buffer_ref(int idx) 331 + int __bch2_journal_key_to_wb(struct bch_fs *c, 332 + struct journal_keys_to_wb *dst, 333 + enum btree_id btree, struct bkey_i *k) 413 334 { 414 - return ((union btree_write_buffer_state) { 415 - .ref0 = idx == 0, 416 - .ref1 = idx == 1, 417 - }).v; 418 - } 419 - 420 - int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans) 421 - { 422 - struct bch_fs *c = trans->c; 423 335 struct btree_write_buffer *wb = 
&c->btree_write_buffer; 424 - struct btree_write_buffered_key *i; 425 - union btree_write_buffer_state old, new; 426 - int ret = 0; 427 - u64 v; 336 + int ret; 337 + retry: 338 + ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL); 339 + if (!ret && dst->wb == &wb->flushing) 340 + ret = darray_resize(&wb->sorted, wb->flushing.keys.size); 428 341 429 - trans_for_each_wb_update(trans, i) { 430 - EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); 342 + if (unlikely(ret)) { 343 + if (dst->wb == &c->btree_write_buffer.flushing) { 344 + mutex_unlock(&dst->wb->lock); 345 + dst->wb = &c->btree_write_buffer.inc; 346 + bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin, 347 + bch2_btree_write_buffer_journal_flush); 348 + goto retry; 349 + } 431 350 432 - i->journal_seq = trans->journal_res.seq; 433 - i->journal_offset = trans->journal_res.offset; 351 + return ret; 434 352 } 435 353 436 - preempt_disable(); 437 - v = READ_ONCE(wb->state.v); 438 - do { 439 - old.v = new.v = v; 354 + dst->room = darray_room(dst->wb->keys); 355 + if (dst->wb == &wb->flushing) 356 + dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr); 357 + BUG_ON(!dst->room); 358 + BUG_ON(!dst->seq); 440 359 441 - new.v += btree_write_buffer_ref(new.idx); 442 - new.nr += trans->nr_wb_updates; 443 - if (new.nr > wb->size) { 444 - ret = -BCH_ERR_btree_insert_need_flush_buffer; 445 - goto out; 360 + struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys); 361 + wb_k->journal_seq = dst->seq; 362 + wb_k->btree = btree; 363 + bkey_copy(&wb_k->k, k); 364 + dst->wb->keys.nr++; 365 + dst->room--; 366 + return 0; 367 + } 368 + 369 + void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq) 370 + { 371 + struct btree_write_buffer *wb = &c->btree_write_buffer; 372 + 373 + if (mutex_trylock(&wb->flushing.lock)) { 374 + mutex_lock(&wb->inc.lock); 375 + move_keys_from_inc_to_flushing(wb); 376 + 377 + /* 378 + * Attempt to skip wb->inc, and 
add keys directly to 379 + * wb->flushing, saving us a copy later: 380 + */ 381 + 382 + if (!wb->inc.keys.nr) { 383 + dst->wb = &wb->flushing; 384 + } else { 385 + mutex_unlock(&wb->flushing.lock); 386 + dst->wb = &wb->inc; 446 387 } 447 - } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v); 388 + } else { 389 + mutex_lock(&wb->inc.lock); 390 + dst->wb = &wb->inc; 391 + } 448 392 449 - memcpy(wb->keys[new.idx] + old.nr, 450 - trans->wb_updates, 451 - sizeof(trans->wb_updates[0]) * trans->nr_wb_updates); 393 + dst->room = darray_room(dst->wb->keys); 394 + if (dst->wb == &wb->flushing) 395 + dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr); 396 + dst->seq = seq; 452 397 453 - bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin, 398 + bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin, 454 399 bch2_btree_write_buffer_journal_flush); 400 + } 455 401 456 - atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter); 402 + void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst) 403 + { 404 + struct btree_write_buffer *wb = &c->btree_write_buffer; 405 + 406 + if (!dst->wb->keys.nr) 407 + bch2_journal_pin_drop(&c->journal, &dst->wb->pin); 408 + 409 + if (bch2_btree_write_buffer_should_flush(c) && 410 + __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) && 411 + !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work)) 412 + bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); 413 + 414 + if (dst->wb == &wb->flushing) 415 + mutex_unlock(&wb->flushing.lock); 416 + mutex_unlock(&wb->inc.lock); 417 + } 418 + 419 + static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) 420 + { 421 + struct journal_keys_to_wb dst; 422 + struct jset_entry *entry; 423 + struct bkey_i *k; 424 + int ret = 0; 425 + 426 + bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); 427 + 428 + 
for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) { 429 + jset_entry_for_each_key(entry, k) { 430 + ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k); 431 + if (ret) 432 + goto out; 433 + } 434 + 435 + entry->type = BCH_JSET_ENTRY_btree_keys; 436 + } 437 + 438 + buf->need_flush_to_write_buffer = false; 457 439 out: 458 - preempt_enable(); 440 + bch2_journal_keys_to_write_buffer_end(c, &dst); 459 441 return ret; 442 + } 443 + 444 + static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size) 445 + { 446 + if (wb->keys.size >= new_size) 447 + return 0; 448 + 449 + if (!mutex_trylock(&wb->lock)) 450 + return -EINTR; 451 + 452 + int ret = darray_resize(&wb->keys, new_size); 453 + mutex_unlock(&wb->lock); 454 + return ret; 455 + } 456 + 457 + int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size) 458 + { 459 + struct btree_write_buffer *wb = &c->btree_write_buffer; 460 + 461 + return wb_keys_resize(&wb->flushing, new_size) ?: 462 + wb_keys_resize(&wb->inc, new_size); 460 463 } 461 464 462 465 void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) 463 466 { 464 467 struct btree_write_buffer *wb = &c->btree_write_buffer; 465 468 466 - BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal)); 469 + BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) && 470 + !bch2_journal_error(&c->journal)); 467 471 468 - kvfree(wb->keys[1]); 469 - kvfree(wb->keys[0]); 472 + darray_exit(&wb->sorted); 473 + darray_exit(&wb->flushing.keys); 474 + darray_exit(&wb->inc.keys); 470 475 } 471 476 472 477 int bch2_fs_btree_write_buffer_init(struct bch_fs *c) 473 478 { 474 479 struct btree_write_buffer *wb = &c->btree_write_buffer; 475 480 476 - mutex_init(&wb->flush_lock); 477 - wb->size = c->opts.btree_write_buffer_size; 481 + mutex_init(&wb->inc.lock); 482 + mutex_init(&wb->flushing.lock); 483 + INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work); 478 484 479 - wb->keys[0] = kvmalloc_array(wb->size, 
sizeof(*wb->keys[0]), GFP_KERNEL); 480 - wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL); 481 - if (!wb->keys[0] || !wb->keys[1]) 482 - return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init; 485 + /* Will be resized by journal as needed: */ 486 + unsigned initial_size = 1 << 16; 483 487 484 - return 0; 488 + return darray_make_room(&wb->inc.keys, initial_size) ?: 489 + darray_make_room(&wb->flushing.keys, initial_size) ?: 490 + darray_make_room(&wb->sorted, initial_size); 485 491 }
+49 -3
fs/bcachefs/btree_write_buffer.h
··· 2 2 #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H 3 3 #define _BCACHEFS_BTREE_WRITE_BUFFER_H 4 4 5 - int bch2_btree_write_buffer_flush_locked(struct btree_trans *); 6 - int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *); 5 + #include "bkey.h" 6 + 7 + static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c) 8 + { 9 + struct btree_write_buffer *wb = &c->btree_write_buffer; 10 + 11 + return wb->inc.keys.nr + wb->flushing.keys.nr > wb->inc.keys.size / 4; 12 + } 13 + 14 + static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c) 15 + { 16 + struct btree_write_buffer *wb = &c->btree_write_buffer; 17 + 18 + return wb->inc.keys.nr > wb->inc.keys.size * 3 / 4; 19 + } 20 + 21 + struct btree_trans; 7 22 int bch2_btree_write_buffer_flush_sync(struct btree_trans *); 23 + int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *); 8 24 int bch2_btree_write_buffer_tryflush(struct btree_trans *); 9 25 10 - int bch2_btree_insert_keys_write_buffer(struct btree_trans *); 26 + struct journal_keys_to_wb { 27 + struct btree_write_buffer_keys *wb; 28 + size_t room; 29 + u64 seq; 30 + }; 11 31 32 + int __bch2_journal_key_to_wb(struct bch_fs *, 33 + struct journal_keys_to_wb *, 34 + enum btree_id, struct bkey_i *); 35 + 36 + static inline int bch2_journal_key_to_wb(struct bch_fs *c, 37 + struct journal_keys_to_wb *dst, 38 + enum btree_id btree, struct bkey_i *k) 39 + { 40 + EBUG_ON(!dst->seq); 41 + 42 + if (unlikely(!dst->room)) 43 + return __bch2_journal_key_to_wb(c, dst, btree, k); 44 + 45 + struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys); 46 + wb_k->journal_seq = dst->seq; 47 + wb_k->btree = btree; 48 + bkey_copy(&wb_k->k, k); 49 + dst->wb->keys.nr++; 50 + dst->room--; 51 + return 0; 52 + } 53 + 54 + void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64); 55 + void bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *); 56 + 57 + int 
bch2_btree_write_buffer_resize(struct bch_fs *, size_t); 12 58 void bch2_fs_btree_write_buffer_exit(struct bch_fs *); 13 59 int bch2_fs_btree_write_buffer_init(struct bch_fs *); 14 60
+38 -25
fs/bcachefs/btree_write_buffer_types.h
··· 2 2 #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H 3 3 #define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H 4 4 5 + #include "darray.h" 5 6 #include "journal_types.h" 6 7 7 8 #define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4 8 9 #define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX) 9 10 11 + struct wb_key_ref { 12 + union { 13 + struct { 14 + #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 15 + unsigned idx:24; 16 + struct bpos pos; 17 + enum btree_id btree:8; 18 + #else 19 + enum btree_id btree:8; 20 + struct bpos pos; 21 + unsigned idx:24; 22 + #endif 23 + } __packed; 24 + struct { 25 + #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 26 + u64 lo; 27 + u64 mi; 28 + u64 hi; 29 + #else 30 + u64 hi; 31 + u64 mi; 32 + u64 lo; 33 + #endif 34 + }; 35 + }; 36 + }; 37 + 10 38 struct btree_write_buffered_key { 11 - u64 journal_seq; 12 - unsigned journal_offset; 13 - enum btree_id btree; 39 + enum btree_id btree:8; 40 + u64 journal_seq:56; 14 41 __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX); 15 42 }; 16 43 17 - union btree_write_buffer_state { 18 - struct { 19 - atomic64_t counter; 20 - }; 21 - 22 - struct { 23 - u64 v; 24 - }; 25 - 26 - struct { 27 - u64 nr:23; 28 - u64 idx:1; 29 - u64 ref0:20; 30 - u64 ref1:20; 31 - }; 44 + struct btree_write_buffer_keys { 45 + DARRAY(struct btree_write_buffered_key) keys; 46 + struct journal_entry_pin pin; 47 + struct mutex lock; 32 48 }; 33 49 34 50 struct btree_write_buffer { 35 - struct mutex flush_lock; 36 - struct journal_entry_pin journal_pin; 37 - 38 - union btree_write_buffer_state state; 39 - size_t size; 40 - 41 - struct btree_write_buffered_key *keys[2]; 51 + DARRAY(struct wb_key_ref) sorted; 52 + struct btree_write_buffer_keys inc; 53 + struct btree_write_buffer_keys flushing; 54 + struct work_struct flush_work; 42 55 }; 43 56 44 57 #endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */
+1 -1
fs/bcachefs/ec.c
··· 1005 1005 unsigned i, nr_data = v->nr_blocks - v->nr_redundant; 1006 1006 int ret = 0; 1007 1007 1008 - ret = bch2_btree_write_buffer_flush_nocheck_rw(trans); 1008 + ret = bch2_btree_write_buffer_flush_sync(trans); 1009 1009 if (ret) 1010 1010 goto err; 1011 1011
-1
fs/bcachefs/errcode.h
··· 151 151 x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \ 152 152 x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \ 153 153 x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \ 154 - x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \ 155 154 x(0, backpointer_to_overwritten_btree_node) \ 156 155 x(0, lock_fail_root_changed) \ 157 156 x(0, journal_reclaim_would_deadlock) \
+44
fs/bcachefs/journal.c
··· 10 10 #include "bkey_methods.h" 11 11 #include "btree_gc.h" 12 12 #include "btree_update.h" 13 + #include "btree_write_buffer.h" 13 14 #include "buckets.h" 14 15 #include "error.h" 15 16 #include "journal.h" ··· 333 332 buf->must_flush = false; 334 333 buf->separate_flush = false; 335 334 buf->flush_time = 0; 335 + buf->need_flush_to_write_buffer = true; 336 336 337 337 memset(buf->data, 0, sizeof(*buf->data)); 338 338 buf->data->seq = cpu_to_le64(journal_cur_seq(j)); ··· 768 766 spin_unlock(&j->lock); 769 767 770 768 journal_quiesce(j); 769 + } 770 + 771 + static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) 772 + { 773 + struct journal_buf *ret = NULL; 774 + 775 + mutex_lock(&j->buf_lock); 776 + spin_lock(&j->lock); 777 + max_seq = min(max_seq, journal_cur_seq(j)); 778 + 779 + for (u64 seq = journal_last_unwritten_seq(j); 780 + seq <= max_seq; 781 + seq++) { 782 + unsigned idx = seq & JOURNAL_BUF_MASK; 783 + struct journal_buf *buf = j->buf + idx; 784 + 785 + if (buf->need_flush_to_write_buffer) { 786 + if (seq == journal_cur_seq(j)) 787 + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); 788 + 789 + union journal_res_state s; 790 + s.v = atomic64_read_acquire(&j->reservations.counter); 791 + 792 + ret = journal_state_count(s, idx) 793 + ? ERR_PTR(-EAGAIN) 794 + : buf; 795 + break; 796 + } 797 + } 798 + 799 + spin_unlock(&j->lock); 800 + if (IS_ERR_OR_NULL(ret)) 801 + mutex_unlock(&j->buf_lock); 802 + return ret; 803 + } 804 + 805 + struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) 806 + { 807 + struct journal_buf *ret; 808 + 809 + wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN)); 810 + return ret; 771 811 } 772 812 773 813 /* allocate journal on a device: */
+1
fs/bcachefs/journal.h
··· 425 425 426 426 void bch2_journal_unblock(struct journal *); 427 427 void bch2_journal_block(struct journal *); 428 + struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq); 428 429 429 430 void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); 430 431 void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
+53 -5
fs/bcachefs/journal_io.c
··· 4 4 #include "alloc_foreground.h" 5 5 #include "btree_io.h" 6 6 #include "btree_update_interior.h" 7 + #include "btree_write_buffer.h" 7 8 #include "buckets.h" 8 9 #include "checksum.h" 9 10 #include "disk_groups.h" ··· 719 718 } 720 719 721 720 static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, 721 + struct jset_entry *entry) 722 + { 723 + journal_entry_btree_keys_to_text(out, c, entry); 724 + } 725 + 726 + static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, 727 + struct jset *jset, 728 + struct jset_entry *entry, 729 + unsigned version, int big_endian, 730 + enum bkey_invalid_flags flags) 731 + { 732 + return journal_entry_btree_keys_validate(c, jset, entry, 733 + version, big_endian, READ); 734 + } 735 + 736 + static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, 722 737 struct jset_entry *entry) 723 738 { 724 739 journal_entry_btree_keys_to_text(out, c, entry); ··· 1520 1503 1521 1504 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) 1522 1505 { 1506 + struct bch_fs *c = container_of(j, struct bch_fs, journal); 1507 + 1523 1508 /* we aren't holding j->lock: */ 1524 1509 unsigned new_size = READ_ONCE(j->buf_size_want); 1525 1510 void *new_buf; 1526 1511 1527 1512 if (buf->buf_size >= new_size) 1513 + return; 1514 + 1515 + size_t btree_write_buffer_size = new_size / 64; 1516 + 1517 + if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) 1528 1518 return; 1529 1519 1530 1520 new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN); ··· 1727 1703 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1728 1704 struct jset_entry *start, *end, *i, *next, *prev = NULL; 1729 1705 struct jset *jset = w->data; 1706 + struct journal_keys_to_wb wb = { NULL }; 1730 1707 unsigned sectors, bytes, u64s; 1731 - bool validate_before_checksum = false; 1732 1708 unsigned long btree_roots_have = 0; 1709 + bool validate_before_checksum = false; 1710 + u64 
seq = le64_to_cpu(jset->seq); 1733 1711 int ret; 1734 1712 1735 1713 /* ··· 1759 1733 * to c->btree_roots we have to get any missing btree roots and 1760 1734 * add them to this journal entry: 1761 1735 */ 1762 - if (i->type == BCH_JSET_ENTRY_btree_root) { 1736 + switch (i->type) { 1737 + case BCH_JSET_ENTRY_btree_root: 1763 1738 bch2_journal_entry_to_btree_root(c, i); 1764 1739 __set_bit(i->btree_id, &btree_roots_have); 1740 + break; 1741 + case BCH_JSET_ENTRY_write_buffer_keys: 1742 + EBUG_ON(!w->need_flush_to_write_buffer); 1743 + 1744 + if (!wb.wb) 1745 + bch2_journal_keys_to_write_buffer_start(c, &wb, seq); 1746 + 1747 + struct bkey_i *k; 1748 + jset_entry_for_each_key(i, k) { 1749 + ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); 1750 + if (ret) { 1751 + bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer"); 1752 + bch2_journal_keys_to_write_buffer_end(c, &wb); 1753 + return ret; 1754 + } 1755 + } 1756 + i->type = BCH_JSET_ENTRY_btree_keys; 1757 + break; 1765 1758 } 1766 1759 1767 1760 /* Can we merge with previous entry? */ ··· 1803 1758 memmove_u64s_down(prev, i, jset_u64s(u64s)); 1804 1759 } 1805 1760 1761 + if (wb.wb) 1762 + bch2_journal_keys_to_write_buffer_end(c, &wb); 1763 + w->need_flush_to_write_buffer = false; 1764 + 1806 1765 prev = prev ? 
vstruct_next(prev) : jset->start; 1807 1766 jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); 1808 1767 ··· 1814 1765 1815 1766 end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); 1816 1767 1817 - bch2_journal_super_entries_add_common(c, &end, 1818 - le64_to_cpu(jset->seq)); 1768 + bch2_journal_super_entries_add_common(c, &end, seq); 1819 1769 u64s = (u64 *) end - (u64 *) start; 1820 1770 BUG_ON(u64s > j->entry_u64s_reserved); 1821 1771 ··· 1837 1789 SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); 1838 1790 1839 1791 if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) 1840 - j->last_empty_seq = le64_to_cpu(jset->seq); 1792 + j->last_empty_seq = seq; 1841 1793 1842 1794 if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) 1843 1795 validate_before_checksum = true;
+8 -4
fs/bcachefs/journal_reclaim.c
··· 3 3 #include "bcachefs.h" 4 4 #include "btree_key_cache.h" 5 5 #include "btree_update.h" 6 + #include "btree_write_buffer.h" 6 7 #include "buckets.h" 7 8 #include "errcode.h" 8 9 #include "error.h" ··· 51 50 return available; 52 51 } 53 52 54 - static inline void journal_set_watermark(struct journal *j) 53 + void bch2_journal_set_watermark(struct journal *j) 55 54 { 56 55 struct bch_fs *c = container_of(j, struct bch_fs, journal); 57 56 bool low_on_space = j->space[journal_space_clean].total * 4 <= 58 57 j->space[journal_space_total].total; 59 58 bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4; 60 - unsigned watermark = low_on_space || low_on_pin 59 + bool low_on_wb = bch2_btree_write_buffer_must_wait(c); 60 + unsigned watermark = low_on_space || low_on_pin || low_on_wb 61 61 ? BCH_WATERMARK_reclaim 62 62 : BCH_WATERMARK_stripe; 63 63 64 64 if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], 65 65 &j->low_on_space_start, low_on_space) || 66 66 track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], 67 - &j->low_on_pin_start, low_on_pin)) 67 + &j->low_on_pin_start, low_on_pin) || 68 + track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], 69 + &j->write_buffer_full_start, low_on_wb)) 68 70 trace_and_count(c, journal_full, c); 69 71 70 72 swap(watermark, j->watermark); ··· 234 230 else 235 231 clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); 236 232 237 - journal_set_watermark(j); 233 + bch2_journal_set_watermark(j); 238 234 out: 239 235 j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; 240 236 j->cur_entry_error = ret;
+1
fs/bcachefs/journal_reclaim.h
··· 16 16 unsigned bch2_journal_dev_buckets_available(struct journal *, 17 17 struct journal_device *, 18 18 enum journal_space_from); 19 + void bch2_journal_set_watermark(struct journal *); 19 20 void bch2_journal_space_available(struct journal *); 20 21 21 22 static inline bool journal_pin_active(struct journal_entry_pin *pin)
+2
fs/bcachefs/journal_types.h
··· 36 36 bool noflush; /* write has already been kicked off, and was noflush */ 37 37 bool must_flush; /* something wants a flush */ 38 38 bool separate_flush; 39 + bool need_flush_to_write_buffer; 39 40 }; 40 41 41 42 /* ··· 277 276 u64 low_on_space_start; 278 277 u64 low_on_pin_start; 279 278 u64 max_in_flight_start; 279 + u64 write_buffer_full_start; 280 280 281 281 struct bch2_time_stats *flush_write_time; 282 282 struct bch2_time_stats *noflush_write_time;
-5
fs/bcachefs/opts.h
··· 233 233 OPT_BOOL(), \ 234 234 BCH2_NO_SB_OPT, true, \ 235 235 NULL, "Stash pointer to in memory btree node in btree ptr")\ 236 - x(btree_write_buffer_size, u32, \ 237 - OPT_FS|OPT_MOUNT, \ 238 - OPT_UINT(16, (1U << 20) - 1), \ 239 - BCH2_NO_SB_OPT, 1U << 13, \ 240 - NULL, "Number of btree write buffer entries") \ 241 236 x(gc_reserve_percent, u8, \ 242 237 OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ 243 238 OPT_UINT(5, 21), \
+2 -1
fs/bcachefs/super.c
··· 363 363 BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); 364 364 BUG_ON(atomic_read(&c->btree_cache.dirty)); 365 365 BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); 366 - BUG_ON(c->btree_write_buffer.state.nr); 366 + BUG_ON(c->btree_write_buffer.inc.keys.nr); 367 + BUG_ON(c->btree_write_buffer.flushing.keys.nr); 367 368 368 369 bch_verbose(c, "marking filesystem clean"); 369 370 bch2_fs_mark_clean(c);