Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bcachefs: Pin btree cache in ram for random access in fsck

Various phases of fsck involve checking references from one btree to
another: this means doing a sequential scan of one btree, and then
mostly random access into the second.

This is particularly painful for checking extents <-> backpointers; we
can prefetch btree node access on the sequential scan, but not on the
random access portion, and this is particularly painful on spinning
rust, where we'd like to keep the pipeline fairly full of btree node
reads so that the elevator can reduce seeking.

This patch implements prefetching and pinning of the portion of the
btree that we'll be doing random access to. We already calculate how
much of the random access btree will fit in memory, so it's a fairly
straightforward change.

This will put more pressure on system memory usage, so we introduce a
new option, fsck_memory_usage_percent, which is the percentage of total
system RAM that fsck is allowed to pin.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

+72 -91
+47 -90
fs/bcachefs/backpointers.c
··· 554 554 }; 555 555 } 556 556 557 - static size_t btree_nodes_fit_in_ram(struct bch_fs *c) 557 + static u64 mem_may_pin_bytes(struct bch_fs *c) 558 558 { 559 559 struct sysinfo i; 560 - u64 mem_bytes; 561 - 562 560 si_meminfo(&i); 563 - mem_bytes = i.totalram * i.mem_unit; 564 - return div_u64(mem_bytes >> 1, c->opts.btree_node_size); 561 + 562 + u64 mem_bytes = i.totalram * i.mem_unit; 563 + return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100); 564 + } 565 + 566 + static size_t btree_nodes_fit_in_ram(struct bch_fs *c) 567 + { 568 + return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size); 565 569 } 566 570 567 571 static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, 568 - unsigned btree_leaf_mask, 569 - unsigned btree_interior_mask, 572 + u64 btree_leaf_mask, 573 + u64 btree_interior_mask, 570 574 struct bbpos start, struct bbpos *end) 571 575 { 572 - struct btree_iter iter; 573 - struct bkey_s_c k; 574 - size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); 575 - enum btree_id btree; 576 + struct bch_fs *c = trans->c; 577 + s64 mem_may_pin = mem_may_pin_bytes(c); 576 578 int ret = 0; 577 579 578 - for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) { 579 - unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2; 580 + btree_interior_mask |= btree_leaf_mask; 581 + 582 + c->btree_cache.pinned_nodes_leaf_mask = btree_leaf_mask; 583 + c->btree_cache.pinned_nodes_interior_mask = btree_interior_mask; 584 + c->btree_cache.pinned_nodes_start = start; 585 + c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX; 586 + 587 + for (enum btree_id btree = start.btree; 588 + btree < BTREE_ID_NR && !ret; 589 + btree++) { 590 + unsigned depth = ((1U << btree) & btree_leaf_mask) ? 
0 : 1; 591 + struct btree_iter iter; 592 + struct btree *b; 580 593 581 594 if (!((1U << btree) & btree_leaf_mask) && 582 595 !((1U << btree) & btree_interior_mask)) 583 596 continue; 584 597 585 - bch2_trans_node_iter_init(trans, &iter, btree, 586 - btree == start.btree ? start.pos : POS_MIN, 587 - 0, depth, 0); 588 - /* 589 - * for_each_btree_key_contineu() doesn't check the return value 590 - * from bch2_btree_iter_advance(), which is needed when 591 - * iterating over interior nodes where we'll see keys at 592 - * SPOS_MAX: 593 - */ 594 - do { 595 - k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0); 596 - ret = bkey_err(k); 597 - if (!k.k || ret) 598 - break; 599 - 600 - --btree_nodes; 601 - if (!btree_nodes) { 602 - *end = BBPOS(btree, k.k->p); 598 + __for_each_btree_node(trans, iter, btree, 599 + btree == start.btree ? start.pos : POS_MIN, 600 + 0, depth, BTREE_ITER_PREFETCH, b, ret) { 601 + mem_may_pin -= btree_buf_bytes(b); 602 + if (mem_may_pin <= 0) { 603 + c->btree_cache.pinned_nodes_end = *end = 604 + BBPOS(btree, b->key.k.p); 603 605 bch2_trans_iter_exit(trans, &iter); 604 606 return 0; 605 607 } 606 - } while (bch2_btree_iter_advance(&iter)); 608 + } 607 609 bch2_trans_iter_exit(trans, &iter); 608 610 } 609 611 610 - *end = BBPOS_MAX; 611 612 return ret; 612 613 } 613 614 ··· 666 665 return 0; 667 666 } 668 667 669 - static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c, 670 - struct bpos bucket) 671 - { 672 - return bch2_dev_exists2(c, bucket.inode) 673 - ? 
bucket_pos_to_bp(c, bucket, 0) 674 - : bucket; 675 - } 676 - 677 - static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, 678 - struct bpos start, struct bpos *end) 679 - { 680 - struct btree_iter alloc_iter; 681 - struct btree_iter bp_iter; 682 - struct bkey_s_c alloc_k, bp_k; 683 - size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); 684 - bool alloc_end = false, bp_end = false; 685 - int ret = 0; 686 - 687 - bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc, 688 - start, 0, 1, 0); 689 - bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers, 690 - bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0); 691 - while (1) { 692 - alloc_k = !alloc_end 693 - ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0) 694 - : bkey_s_c_null; 695 - bp_k = !bp_end 696 - ? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0) 697 - : bkey_s_c_null; 698 - 699 - ret = bkey_err(alloc_k) ?: bkey_err(bp_k); 700 - if ((!alloc_k.k && !bp_k.k) || ret) { 701 - *end = SPOS_MAX; 702 - break; 703 - } 704 - 705 - --btree_nodes; 706 - if (!btree_nodes) { 707 - *end = alloc_k.k ? 
alloc_k.k->p : SPOS_MAX; 708 - break; 709 - } 710 - 711 - if (bpos_lt(alloc_iter.pos, SPOS_MAX) && 712 - bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) { 713 - if (!bch2_btree_iter_advance(&alloc_iter)) 714 - alloc_end = true; 715 - } else { 716 - if (!bch2_btree_iter_advance(&bp_iter)) 717 - bp_end = true; 718 - } 719 - } 720 - bch2_trans_iter_exit(trans, &bp_iter); 721 - bch2_trans_iter_exit(trans, &alloc_iter); 722 - return ret; 723 - } 724 - 725 668 int bch2_check_extents_to_backpointers(struct bch_fs *c) 726 669 { 727 670 struct btree_trans *trans = bch2_trans_get(c); ··· 676 731 bkey_init(&s.last_flushed.k->k); 677 732 678 733 while (1) { 679 - ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end); 734 + struct bbpos end; 735 + ret = bch2_get_btree_in_memory_pos(trans, 736 + BIT_ULL(BTREE_ID_backpointers), 737 + BIT_ULL(BTREE_ID_backpointers), 738 + BBPOS(BTREE_ID_backpointers, s.bucket_start), &end); 680 739 if (ret) 681 740 break; 741 + 742 + s.bucket_end = end.pos; 682 743 683 744 if ( bpos_eq(s.bucket_start, POS_MIN) && 684 745 !bpos_eq(s.bucket_end, SPOS_MAX)) ··· 712 761 } 713 762 bch2_trans_put(trans); 714 763 bch2_bkey_buf_exit(&s.last_flushed, c); 764 + 765 + c->btree_cache.pinned_nodes_leaf_mask = 0; 766 + c->btree_cache.pinned_nodes_interior_mask = 0; 715 767 716 768 bch_err_fn(c, ret); 717 769 return ret; ··· 820 866 start = bbpos_successor(end); 821 867 } 822 868 bch2_trans_put(trans); 869 + 870 + c->btree_cache.pinned_nodes_leaf_mask = 0; 871 + c->btree_cache.pinned_nodes_interior_mask = 0; 823 872 824 873 bch_err_fn(c, ret); 825 874 return ret;
+1 -1
fs/bcachefs/bbpos_types.h
··· 13 13 } 14 14 15 15 #define BBPOS_MIN BBPOS(0, POS_MIN) 16 - #define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX) 16 + #define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, SPOS_MAX) 17 17 18 18 #endif /* _BCACHEFS_BBPOS_TYPES_H */
+13
fs/bcachefs/btree_cache.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 3 3 #include "bcachefs.h" 4 + #include "bbpos.h" 4 5 #include "bkey_buf.h" 5 6 #include "btree_cache.h" 6 7 #include "btree_io.h" ··· 209 208 int ret = 0; 210 209 211 210 lockdep_assert_held(&bc->lock); 211 + 212 + struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p); 213 + 214 + u64 mask = b->c.level 215 + ? bc->pinned_nodes_interior_mask 216 + : bc->pinned_nodes_leaf_mask; 217 + 218 + if ((mask & BIT_ULL(b->c.btree_id)) && 219 + bbpos_cmp(bc->pinned_nodes_start, pos) < 0 && 220 + bbpos_cmp(bc->pinned_nodes_end, pos) >= 0) 221 + return -BCH_ERR_ENOMEM_btree_node_reclaim; 222 + 212 223 wait_on_io: 213 224 if (b->flags & ((1U << BTREE_NODE_dirty)| 214 225 (1U << BTREE_NODE_read_in_flight)|
+6
fs/bcachefs/btree_types.h
··· 5 5 #include <linux/list.h> 6 6 #include <linux/rhashtable.h> 7 7 8 + #include "bbpos_types.h" 8 9 #include "btree_key_cache_types.h" 9 10 #include "buckets_types.h" 10 11 #include "darray.h" ··· 174 173 */ 175 174 struct task_struct *alloc_lock; 176 175 struct closure_waitlist alloc_wait; 176 + 177 + struct bbpos pinned_nodes_start; 178 + struct bbpos pinned_nodes_end; 179 + u64 pinned_nodes_leaf_mask; 180 + u64 pinned_nodes_interior_mask; 177 181 }; 178 182 179 183 struct btree_node_iter {
+5
fs/bcachefs/opts.h
··· 337 337 OPT_BOOL(), \ 338 338 BCH2_NO_SB_OPT, false, \ 339 339 NULL, "Run fsck on mount") \ 340 + x(fsck_memory_usage_percent, u8, \ 341 + OPT_FS|OPT_MOUNT, \ 342 + OPT_UINT(20, 70), \ 343 + BCH2_NO_SB_OPT, 50, \ 344 + NULL, "Maximum percentage of system ram fsck is allowed to pin")\ 340 345 x(fix_errors, u8, \ 341 346 OPT_FS|OPT_MOUNT, \ 342 347 OPT_FN(bch2_opt_fix_errors), \