Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bcachefs: Simplify journal replay

With BTREE_ITER_WITH_JOURNAL, there are no longer any restrictions on the
order in which we have to replay keys from the journal, and we can also start
up journal reclaim right away - and delete a bunch of code.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>

authored by

Kent Overstreet and committed by
Kent Overstreet
d8601afc 8e432d98

+22 -114
+1 -2
fs/bcachefs/alloc_background.c
··· 902 902 static bool allocator_thread_running(struct bch_dev *ca) 903 903 { 904 904 unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw && 905 - test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) && 906 - test_bit(BCH_FS_ALLOC_REPLAY_DONE, &ca->fs->flags) 905 + test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) 907 906 ? ALLOCATOR_running 908 907 : ALLOCATOR_stopped; 909 908 alloc_thread_set_state(ca, state);
-2
fs/bcachefs/bcachefs.h
··· 510 510 BCH_FS_INITIAL_GC_DONE, 511 511 BCH_FS_INITIAL_GC_UNFIXED, 512 512 BCH_FS_TOPOLOGY_REPAIR_DONE, 513 - BCH_FS_ALLOC_REPLAY_DONE, 514 - BCH_FS_BTREE_INTERIOR_REPLAY_DONE, 515 513 BCH_FS_FSCK_DONE, 516 514 BCH_FS_STARTED, 517 515 BCH_FS_RW,
+1 -2
fs/bcachefs/btree_key_cache.h
··· 16 16 size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); 17 17 size_t max_dirty = 4096 + (nr_keys * 3) / 4; 18 18 19 - return nr_dirty > max_dirty && 20 - test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); 19 + return nr_dirty > max_dirty; 21 20 } 22 21 23 22 int bch2_btree_key_cache_journal_flush(struct journal *,
+1 -4
fs/bcachefs/btree_update_interior.c
··· 45 45 46 46 BUG_ON(!b->c.level); 47 47 48 - if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) 48 + if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) 49 49 return; 50 50 51 51 bch2_btree_node_iter_init_from_start(&iter, b); ··· 1850 1850 void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) 1851 1851 { 1852 1852 struct async_btree_rewrite *a; 1853 - 1854 - if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) 1855 - return; 1856 1853 1857 1854 if (!percpu_ref_tryget(&c->writes)) 1858 1855 return;
-3
fs/bcachefs/btree_update_leaf.c
··· 206 206 int old_live_u64s = b->nr.live_u64s; 207 207 int live_u64s_added, u64s_added; 208 208 209 - EBUG_ON(!insert->level && 210 - !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)); 211 - 212 209 if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b, 213 210 &insert_l(insert)->iter, insert->k))) 214 211 return false;
-5
fs/bcachefs/journal_reclaim.c
··· 489 489 u64 seq; 490 490 int err; 491 491 492 - if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)) 493 - return 0; 494 - 495 492 lockdep_assert_held(&j->reclaim_lock); 496 493 497 494 while (1) { ··· 685 688 int ret = 0; 686 689 687 690 set_freezable(); 688 - 689 - kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)); 690 691 691 692 j->last_flushed = jiffies; 692 693
-1
fs/bcachefs/journal_types.h
··· 148 148 enum { 149 149 JOURNAL_REPLAY_DONE, 150 150 JOURNAL_STARTED, 151 - JOURNAL_RECLAIM_STARTED, 152 151 JOURNAL_NEED_WRITE, 153 152 JOURNAL_MAY_GET_UNRESERVED, 154 153 JOURNAL_MAY_SKIP_FLUSH,
+19 -95
fs/bcachefs/recovery.c
··· 474 474 bch2_journal_pin_put(j, j->replay_journal_seq++); 475 475 } 476 476 477 - static int __bch2_journal_replay_key(struct btree_trans *trans, 478 - struct journal_key *k) 477 + static int bch2_journal_replay_key(struct btree_trans *trans, 478 + struct journal_key *k) 479 479 { 480 480 struct btree_iter iter; 481 481 unsigned iter_flags = ··· 484 484 int ret; 485 485 486 486 if (!k->level && k->btree_id == BTREE_ID_alloc) 487 - iter_flags |= BTREE_ITER_CACHED|BTREE_ITER_CACHED_NOFILL; 487 + iter_flags |= BTREE_ITER_CACHED; 488 488 489 489 bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, 490 490 BTREE_MAX_DEPTH, k->level, ··· 503 503 return ret; 504 504 } 505 505 506 - static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k) 507 - { 508 - unsigned commit_flags = 509 - BTREE_INSERT_LAZY_RW| 510 - BTREE_INSERT_NOFAIL| 511 - BTREE_INSERT_JOURNAL_RESERVED; 512 - 513 - if (!k->allocated) 514 - commit_flags |= BTREE_INSERT_JOURNAL_REPLAY; 515 - 516 - return bch2_trans_do(c, NULL, NULL, commit_flags, 517 - __bch2_journal_replay_key(&trans, k)); 518 - } 519 - 520 506 static int journal_sort_seq_cmp(const void *_l, const void *_r) 521 507 { 522 508 const struct journal_key *l = *((const struct journal_key **)_l); 523 509 const struct journal_key *r = *((const struct journal_key **)_r); 524 510 525 - return cmp_int(r->level, l->level) ?: 526 - cmp_int(l->journal_seq, r->journal_seq) ?: 527 - cmp_int(l->btree_id, r->btree_id) ?: 528 - bpos_cmp(l->k->k.p, r->k->k.p); 511 + return cmp_int(l->journal_seq, r->journal_seq); 529 512 } 530 513 531 514 static int bch2_journal_replay(struct bch_fs *c) ··· 516 533 struct journal_keys *keys = &c->journal_keys; 517 534 struct journal_key **keys_sorted, *k; 518 535 struct journal *j = &c->journal; 519 - struct bch_dev *ca; 520 - unsigned idx; 521 536 size_t i; 522 - u64 seq; 523 537 int ret; 524 538 525 539 keys_sorted = kmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); ··· 535 555 
replay_now_at(j, keys->journal_seq_base); 536 556 } 537 557 538 - seq = j->replay_journal_seq; 539 - 540 - /* 541 - * First replay updates to the alloc btree - these will only update the 542 - * btree key cache: 543 - */ 544 558 for (i = 0; i < keys->nr; i++) { 545 559 k = keys_sorted[i]; 546 560 547 561 cond_resched(); 548 562 549 - if (!k->level && k->btree_id == BTREE_ID_alloc) { 550 - j->replay_journal_seq = keys->journal_seq_base + k->journal_seq; 551 - ret = bch2_journal_replay_key(c, k); 552 - if (ret) 553 - goto err; 554 - } 555 - } 563 + if (!k->allocated) 564 + replay_now_at(j, keys->journal_seq_base + k->journal_seq); 556 565 557 - /* Now we can start the allocator threads: */ 558 - set_bit(BCH_FS_ALLOC_REPLAY_DONE, &c->flags); 559 - for_each_member_device(ca, c, idx) 560 - bch2_wake_allocator(ca); 561 - 562 - /* 563 - * Next replay updates to interior btree nodes: 564 - */ 565 - for (i = 0; i < keys->nr; i++) { 566 - k = keys_sorted[i]; 567 - 568 - cond_resched(); 569 - 570 - if (k->level) { 571 - j->replay_journal_seq = keys->journal_seq_base + k->journal_seq; 572 - ret = bch2_journal_replay_key(c, k); 573 - if (ret) 574 - goto err; 575 - } 576 - } 577 - 578 - /* 579 - * Now that the btree is in a consistent state, we can start journal 580 - * reclaim (which will be flushing entries from the btree key cache back 581 - * to the btree: 582 - */ 583 - set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); 584 - set_bit(JOURNAL_RECLAIM_STARTED, &j->flags); 585 - journal_reclaim_kick(j); 586 - 587 - j->replay_journal_seq = seq; 588 - 589 - /* 590 - * Now replay leaf node updates: 591 - */ 592 - for (i = 0; i < keys->nr; i++) { 593 - k = keys_sorted[i]; 594 - 595 - cond_resched(); 596 - 597 - if (k->level || k->btree_id == BTREE_ID_alloc) 598 - continue; 599 - 600 - replay_now_at(j, keys->journal_seq_base + k->journal_seq); 601 - 602 - ret = bch2_journal_replay_key(c, k); 603 - if (ret) 566 + ret = bch2_trans_do(c, NULL, NULL, 567 + BTREE_INSERT_LAZY_RW| 568 
+ BTREE_INSERT_NOFAIL| 569 + BTREE_INSERT_JOURNAL_RESERVED| 570 + (!k->allocated ? BTREE_INSERT_JOURNAL_REPLAY : 0), 571 + bch2_journal_replay_key(&trans, k)); 572 + if (ret) { 573 + bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", 574 + ret, bch2_btree_ids[k->btree_id], k->level); 604 575 goto err; 576 + } 605 577 } 606 578 607 579 replay_now_at(j, j->replay_journal_seq_end); ··· 561 629 562 630 bch2_journal_set_replay_done(j); 563 631 bch2_journal_flush_all_pins(j); 564 - kfree(keys_sorted); 565 - 566 - return bch2_journal_error(j); 632 + ret = bch2_journal_error(j); 567 633 err: 568 - bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", 569 - ret, bch2_btree_ids[k->btree_id], k->level); 570 634 kfree(keys_sorted); 571 - 572 635 return ret; 573 636 } 574 637 ··· 1142 1215 ret = bch2_journal_replay(c); 1143 1216 if (ret) 1144 1217 goto err; 1145 - bch_verbose(c, "journal replay done"); 1218 + if (c->opts.verbose || !c->sb.clean) 1219 + bch_info(c, "journal replay done"); 1146 1220 1147 1221 if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) && 1148 1222 !c->opts.nochanges) { ··· 1312 1384 1313 1385 for (i = 0; i < BTREE_ID_NR; i++) 1314 1386 bch2_btree_root_alloc(c, i); 1315 - 1316 - set_bit(BCH_FS_ALLOC_REPLAY_DONE, &c->flags); 1317 - set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); 1318 - set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); 1319 1387 1320 1388 err = "unable to allocate journal buckets"; 1321 1389 for_each_online_member(ca, c, i) {