Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'bcachefs-2024-04-03' of https://evilpiepirate.org/git/bcachefs

Pull bcachefs repair code from Kent Overstreet:
"A couple more small fixes, and new repair code.

We can now automatically recover from arbitrary corrupted interior
btree nodes by scanning, and we can reconstruct metadata as needed to
bring a filesystem back into a working, consistent, read-write state
and preserve access to whatever wasn't corrupted.

Meaning - you can blow away all metadata except for extents and
dirents leaf nodes, and repair will reconstruct everything else and
give you your data, and under the correct paths. If inodes are missing
i_size will be slightly off and permissions/ownership/timestamps will
be gone, and we do still need the snapshots btree if snapshots were in
use - in the future we'll be able to guess the snapshot tree structure
in some situations.

IOW - aside from shaking out remaining bugs (fuzz testing is still
coming), repair code should be complete and if repair ever doesn't
work that's the highest priority bug that I want to know about
immediately.

This patchset was kindly tested by a user from India who accidentally
wiped one drive out of a three drive filesystem with no replication on
the family computer - it took a couple weeks but we got everything
important back"

* tag 'bcachefs-2024-04-03' of https://evilpiepirate.org/git/bcachefs:
bcachefs: reconstruct_inode()
bcachefs: Subvolume reconstruction
bcachefs: Check for extents that point to same space
bcachefs: Reconstruct missing snapshot nodes
bcachefs: Flag btrees with missing data
bcachefs: Topology repair now uses nodes found by scanning to fill holes
bcachefs: Repair pass for scanning for btree nodes
bcachefs: Don't skip fake btree roots in fsck
bcachefs: bch2_btree_root_alloc() -> bch2_btree_root_alloc_fake()
bcachefs: Etyzinger cleanups
bcachefs: bch2_shoot_down_journal_keys()
bcachefs: Clear recovery_passes_required as they complete without errors
bcachefs: ratelimit informational fsck errors
bcachefs: Check for bad needs_discard before doing discard
bcachefs: Improve bch2_btree_update_to_text()
mean_and_variance: Drop always failing tests
bcachefs: fix nocow lock deadlock
bcachefs: BCH_WATERMARK_interior_updates
bcachefs: Fix btree node reserve

+1867 -492
+2
fs/bcachefs/Makefile
··· 17 17 btree_journal_iter.o \ 18 18 btree_key_cache.o \ 19 19 btree_locking.o \ 20 + btree_node_scan.o \ 20 21 btree_trans_commit.o \ 21 22 btree_update.o \ 22 23 btree_update_interior.o \ ··· 38 37 error.o \ 39 38 extents.o \ 40 39 extent_update.o \ 40 + eytzinger.o \ 41 41 fs.o \ 42 42 fs-common.o \ 43 43 fs-ioctl.o \
+26 -21
fs/bcachefs/alloc_background.c
··· 1713 1713 if (ret) 1714 1714 goto out; 1715 1715 1716 - if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { 1717 - a->v.gen++; 1718 - SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); 1719 - goto write; 1720 - } 1721 - 1722 - if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { 1723 - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { 1724 - bch2_trans_inconsistent(trans, 1725 - "clearing need_discard but journal_seq %llu > flushed_seq %llu\n" 1726 - "%s", 1727 - a->v.journal_seq, 1728 - c->journal.flushed_seq_ondisk, 1729 - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); 1716 + if (a->v.dirty_sectors) { 1717 + if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, 1718 + trans, "attempting to discard bucket with dirty data\n%s", 1719 + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) 1730 1720 ret = -EIO; 1731 - } 1732 1721 goto out; 1733 1722 } 1734 1723 1735 1724 if (a->v.data_type != BCH_DATA_need_discard) { 1736 - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { 1737 - bch2_trans_inconsistent(trans, 1738 - "bucket incorrectly set in need_discard btree\n" 1739 - "%s", 1740 - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); 1741 - ret = -EIO; 1725 + if (data_type_is_empty(a->v.data_type) && 1726 + BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { 1727 + a->v.gen++; 1728 + SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); 1729 + goto write; 1742 1730 } 1743 1731 1732 + if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, 1733 + trans, "bucket incorrectly set in need_discard btree\n" 1734 + "%s", 1735 + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) 1736 + ret = -EIO; 1737 + goto out; 1738 + } 1739 + 1740 + if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { 1741 + if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, 1742 + trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s", 1743 + a->v.journal_seq, 1744 + 
c->journal.flushed_seq_ondisk, 1745 + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) 1746 + ret = -EIO; 1744 1747 goto out; 1745 1748 } 1746 1749 ··· 1838 1835 if (ret) 1839 1836 goto err; 1840 1837 1838 + BUG_ON(a->v.dirty_sectors); 1841 1839 SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); 1842 1840 a->v.data_type = alloc_data_type(a->v, a->v.data_type); 1843 1841 ··· 1946 1942 goto out; 1947 1943 1948 1944 BUG_ON(a->v.data_type != BCH_DATA_cached); 1945 + BUG_ON(a->v.dirty_sectors); 1949 1946 1950 1947 if (!a->v.cached_sectors) 1951 1948 bch_err(c, "invalidating empty bucket, confused");
+3 -1
fs/bcachefs/alloc_foreground.c
··· 188 188 static inline unsigned open_buckets_reserved(enum bch_watermark watermark) 189 189 { 190 190 switch (watermark) { 191 - case BCH_WATERMARK_reclaim: 191 + case BCH_WATERMARK_interior_updates: 192 192 return 0; 193 + case BCH_WATERMARK_reclaim: 194 + return OPEN_BUCKETS_COUNT / 6; 193 195 case BCH_WATERMARK_btree: 194 196 case BCH_WATERMARK_btree_copygc: 195 197 return OPEN_BUCKETS_COUNT / 4;
+2 -1
fs/bcachefs/alloc_types.h
··· 22 22 x(copygc) \ 23 23 x(btree) \ 24 24 x(btree_copygc) \ 25 - x(reclaim) 25 + x(reclaim) \ 26 + x(interior_updates) 26 27 27 28 enum bch_watermark { 28 29 #define x(name) BCH_WATERMARK_##name,
+166 -7
fs/bcachefs/backpointers.c
··· 8 8 #include "btree_update.h" 9 9 #include "btree_update_interior.h" 10 10 #include "btree_write_buffer.h" 11 + #include "checksum.h" 11 12 #include "error.h" 12 13 13 14 #include <linux/mm.h> ··· 419 418 struct bkey_buf last_flushed; 420 419 }; 421 420 421 + static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree, 422 + struct bkey_s_c extent, unsigned dev) 423 + { 424 + struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent); 425 + int ret = PTR_ERR_OR_ZERO(n); 426 + if (ret) 427 + return ret; 428 + 429 + bch2_bkey_drop_device(bkey_i_to_s(n), dev); 430 + return bch2_btree_insert_trans(trans, btree, n, 0); 431 + } 432 + 433 + static int check_extent_checksum(struct btree_trans *trans, 434 + enum btree_id btree, struct bkey_s_c extent, 435 + enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev) 436 + { 437 + struct bch_fs *c = trans->c; 438 + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent); 439 + const union bch_extent_entry *entry; 440 + struct extent_ptr_decoded p; 441 + struct printbuf buf = PRINTBUF; 442 + void *data_buf = NULL; 443 + struct bio *bio = NULL; 444 + size_t bytes; 445 + int ret = 0; 446 + 447 + if (bkey_is_btree_ptr(extent.k)) 448 + return false; 449 + 450 + bkey_for_each_ptr_decode(extent.k, ptrs, p, entry) 451 + if (p.ptr.dev == dev) 452 + goto found; 453 + BUG(); 454 + found: 455 + if (!p.crc.csum_type) 456 + return false; 457 + 458 + bytes = p.crc.compressed_size << 9; 459 + 460 + struct bch_dev *ca = bch_dev_bkey_exists(c, dev); 461 + if (!bch2_dev_get_ioref(ca, READ)) 462 + return false; 463 + 464 + data_buf = kvmalloc(bytes, GFP_KERNEL); 465 + if (!data_buf) { 466 + ret = -ENOMEM; 467 + goto err; 468 + } 469 + 470 + bio = bio_alloc(ca->disk_sb.bdev, 1, REQ_OP_READ, GFP_KERNEL); 471 + bio->bi_iter.bi_sector = p.ptr.offset; 472 + bch2_bio_map(bio, data_buf, bytes); 473 + ret = submit_bio_wait(bio); 474 + if (ret) 475 + goto err; 476 + 477 + prt_str(&buf, "extents pointing to same space, but first 
extent checksum bad:"); 478 + prt_printf(&buf, "\n %s ", bch2_btree_id_str(btree)); 479 + bch2_bkey_val_to_text(&buf, c, extent); 480 + prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree)); 481 + bch2_bkey_val_to_text(&buf, c, extent2); 482 + 483 + struct nonce nonce = extent_nonce(extent.k->version, p.crc); 484 + struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes); 485 + if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum), 486 + c, dup_backpointer_to_bad_csum_extent, 487 + "%s", buf.buf)) 488 + ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1; 489 + fsck_err: 490 + err: 491 + if (bio) 492 + bio_put(bio); 493 + kvfree(data_buf); 494 + percpu_ref_put(&ca->io_ref); 495 + printbuf_exit(&buf); 496 + return ret; 497 + } 498 + 422 499 static int check_bp_exists(struct btree_trans *trans, 423 500 struct extents_to_bp_state *s, 424 501 struct bpos bucket, ··· 504 425 struct bkey_s_c orig_k) 505 426 { 506 427 struct bch_fs *c = trans->c; 507 - struct btree_iter bp_iter = { NULL }; 428 + struct btree_iter bp_iter = {}; 429 + struct btree_iter other_extent_iter = {}; 508 430 struct printbuf buf = PRINTBUF; 509 431 struct bkey_s_c bp_k; 510 432 struct bkey_buf tmp; ··· 513 433 514 434 bch2_bkey_buf_init(&tmp); 515 435 436 + if (!bch2_dev_bucket_exists(c, bucket)) { 437 + prt_str(&buf, "extent for nonexistent device:bucket "); 438 + bch2_bpos_to_text(&buf, bucket); 439 + prt_str(&buf, "\n "); 440 + bch2_bkey_val_to_text(&buf, c, orig_k); 441 + bch_err(c, "%s", buf.buf); 442 + return -BCH_ERR_fsck_repair_unimplemented; 443 + } 444 + 516 445 if (bpos_lt(bucket, s->bucket_start) || 517 446 bpos_gt(bucket, s->bucket_end)) 518 447 return 0; 519 - 520 - if (!bch2_dev_bucket_exists(c, bucket)) 521 - goto missing; 522 448 523 449 bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, 524 450 bucket_pos_to_bp(c, bucket, bp.bucket_offset), ··· 551 465 ret = -BCH_ERR_transaction_restart_write_buffer_flush; 552 466 goto out; 553 467 } 554 - 
goto missing; 468 + 469 + goto check_existing_bp; 555 470 } 556 471 out: 557 472 err: 558 473 fsck_err: 474 + bch2_trans_iter_exit(trans, &other_extent_iter); 559 475 bch2_trans_iter_exit(trans, &bp_iter); 560 476 bch2_bkey_buf_exit(&tmp, c); 561 477 printbuf_exit(&buf); 562 478 return ret; 479 + check_existing_bp: 480 + /* Do we have a backpointer for a different extent? */ 481 + if (bp_k.k->type != KEY_TYPE_backpointer) 482 + goto missing; 483 + 484 + struct bch_backpointer other_bp = *bkey_s_c_to_backpointer(bp_k).v; 485 + 486 + struct bkey_s_c other_extent = 487 + bch2_backpointer_get_key(trans, &other_extent_iter, bp_k.k->p, other_bp, 0); 488 + ret = bkey_err(other_extent); 489 + if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) 490 + ret = 0; 491 + if (ret) 492 + goto err; 493 + 494 + if (!other_extent.k) 495 + goto missing; 496 + 497 + if (bch2_extents_match(orig_k, other_extent)) { 498 + printbuf_reset(&buf); 499 + prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n "); 500 + bch2_bkey_val_to_text(&buf, c, orig_k); 501 + prt_str(&buf, "\n "); 502 + bch2_bkey_val_to_text(&buf, c, other_extent); 503 + bch_err(c, "%s", buf.buf); 504 + 505 + if (other_extent.k->size <= orig_k.k->size) { 506 + ret = drop_dev_and_update(trans, other_bp.btree_id, other_extent, bucket.inode); 507 + if (ret) 508 + goto err; 509 + goto out; 510 + } else { 511 + ret = drop_dev_and_update(trans, bp.btree_id, orig_k, bucket.inode); 512 + if (ret) 513 + goto err; 514 + goto missing; 515 + } 516 + } 517 + 518 + ret = check_extent_checksum(trans, other_bp.btree_id, other_extent, bp.btree_id, orig_k, bucket.inode); 519 + if (ret < 0) 520 + goto err; 521 + if (ret) { 522 + ret = 0; 523 + goto missing; 524 + } 525 + 526 + ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.btree_id, other_extent, bucket.inode); 527 + if (ret < 0) 528 + goto err; 529 + if (ret) { 530 + ret = 0; 531 + goto out; 532 + } 533 + 534 + printbuf_reset(&buf); 535 + 
prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bucket.inode); 536 + bch2_bkey_val_to_text(&buf, c, orig_k); 537 + prt_str(&buf, "\n "); 538 + bch2_bkey_val_to_text(&buf, c, other_extent); 539 + bch_err(c, "%s", buf.buf); 540 + ret = -BCH_ERR_fsck_repair_unimplemented; 541 + goto err; 563 542 missing: 543 + printbuf_reset(&buf); 564 544 prt_printf(&buf, "missing backpointer for btree=%s l=%u ", 565 545 bch2_btree_id_str(bp.btree_id), bp.level); 566 546 bch2_bkey_val_to_text(&buf, c, orig_k); 567 - prt_printf(&buf, "\nbp pos "); 568 - bch2_bpos_to_text(&buf, bp_iter.pos); 547 + prt_printf(&buf, "\n got: "); 548 + bch2_bkey_val_to_text(&buf, c, bp_k); 549 + 550 + struct bkey_i_backpointer n_bp_k; 551 + bkey_backpointer_init(&n_bp_k.k_i); 552 + n_bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); 553 + n_bp_k.v = bp; 554 + prt_printf(&buf, "\n want: "); 555 + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i)); 569 556 570 557 if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) 571 558 ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
+5
fs/bcachefs/bcachefs.h
··· 456 456 457 457 #include "alloc_types.h" 458 458 #include "btree_types.h" 459 + #include "btree_node_scan_types.h" 459 460 #include "btree_write_buffer_types.h" 460 461 #include "buckets_types.h" 461 462 #include "buckets_waiting_for_journal_types.h" ··· 615 614 */ 616 615 617 616 #define BCH_FS_FLAGS() \ 617 + x(new_fs) \ 618 618 x(started) \ 619 619 x(may_go_rw) \ 620 620 x(rw) \ ··· 798 796 u64 features; 799 797 u64 compat; 800 798 unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)]; 799 + u64 btrees_lost_data; 801 800 } sb; 802 801 803 802 ··· 1105 1102 u64 journal_entries_base_seq; 1106 1103 struct journal_keys journal_keys; 1107 1104 struct list_head journal_iters; 1105 + 1106 + struct find_btree_nodes found_btree_nodes; 1108 1107 1109 1108 u64 last_bucket_seq_cleanup; 1110 1109
+1
fs/bcachefs/bcachefs_format.h
··· 818 818 struct bch_sb_field field; 819 819 __le64 recovery_passes_required[2]; 820 820 __le64 errors_silent[8]; 821 + __le64 btrees_lost_data; 821 822 }; 822 823 823 824 struct bch_sb_field_downgrade_entry {
+195 -107
fs/bcachefs/btree_gc.c
··· 13 13 #include "btree_journal_iter.h" 14 14 #include "btree_key_cache.h" 15 15 #include "btree_locking.h" 16 + #include "btree_node_scan.h" 16 17 #include "btree_update_interior.h" 17 18 #include "btree_io.h" 18 19 #include "btree_gc.h" ··· 42 41 43 42 #define DROP_THIS_NODE 10 44 43 #define DROP_PREV_NODE 11 44 + #define DID_FILL_FROM_SCAN 12 45 45 46 46 static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k) 47 47 { ··· 131 129 struct bkey_i_btree_ptr_v2 *new; 132 130 int ret; 133 131 132 + if (c->opts.verbose) { 133 + struct printbuf buf = PRINTBUF; 134 + 135 + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); 136 + prt_str(&buf, " -> "); 137 + bch2_bpos_to_text(&buf, new_min); 138 + 139 + bch_info(c, "%s(): %s", __func__, buf.buf); 140 + printbuf_exit(&buf); 141 + } 142 + 134 143 new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); 135 144 if (!new) 136 145 return -BCH_ERR_ENOMEM_gc_repair_key; ··· 166 153 { 167 154 struct bkey_i_btree_ptr_v2 *new; 168 155 int ret; 156 + 157 + if (c->opts.verbose) { 158 + struct printbuf buf = PRINTBUF; 159 + 160 + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); 161 + prt_str(&buf, " -> "); 162 + bch2_bpos_to_text(&buf, new_max); 163 + 164 + bch_info(c, "%s(): %s", __func__, buf.buf); 165 + printbuf_exit(&buf); 166 + } 169 167 170 168 ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p); 171 169 if (ret) ··· 209 185 return 0; 210 186 } 211 187 212 - static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, 213 - struct btree *prev, struct btree *cur) 188 + static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b, 189 + struct btree *prev, struct btree *cur, 190 + struct bpos *pulled_from_scan) 214 191 { 215 192 struct bpos expected_start = !prev 216 193 ? 
b->data->min_key 217 194 : bpos_successor(prev->key.k.p); 218 - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; 195 + struct printbuf buf = PRINTBUF; 219 196 int ret = 0; 220 197 221 - if (!prev) { 222 - prt_printf(&buf1, "start of node: "); 223 - bch2_bpos_to_text(&buf1, b->data->min_key); 224 - } else { 225 - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key)); 198 + BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && 199 + !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, 200 + b->data->min_key)); 201 + 202 + if (bpos_eq(expected_start, cur->data->min_key)) 203 + return 0; 204 + 205 + prt_printf(&buf, " at btree %s level %u:\n parent: ", 206 + bch2_btree_id_str(b->c.btree_id), b->c.level); 207 + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); 208 + 209 + if (prev) { 210 + prt_printf(&buf, "\n prev: "); 211 + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key)); 226 212 } 227 213 228 - bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key)); 214 + prt_str(&buf, "\n next: "); 215 + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key)); 229 216 230 - if (prev && 231 - bpos_gt(expected_start, cur->data->min_key) && 232 - BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { 233 - /* cur overwrites prev: */ 217 + if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */ 218 + if (b->c.level == 1 && 219 + bpos_lt(*pulled_from_scan, cur->data->min_key)) { 220 + ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0, 221 + expected_start, 222 + bpos_predecessor(cur->data->min_key)); 223 + if (ret) 224 + goto err; 234 225 235 - if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key, 236 - cur->data->min_key), c, 237 - btree_node_topology_overwritten_by_next_node, 238 - "btree node overwritten by next node at btree %s level %u:\n" 239 - " node %s\n" 240 - " next %s", 241 - bch2_btree_id_str(b->c.btree_id), b->c.level, 242 - buf1.buf, buf2.buf)) { 243 - ret = DROP_PREV_NODE; 244 - goto out; 226 + *pulled_from_scan = cur->data->min_key; 227 + ret = 
DID_FILL_FROM_SCAN; 228 + } else { 229 + if (mustfix_fsck_err(c, btree_node_topology_bad_min_key, 230 + "btree node with incorrect min_key%s", buf.buf)) 231 + ret = set_node_min(c, cur, expected_start); 245 232 } 246 - 247 - if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p, 248 - bpos_predecessor(cur->data->min_key)), c, 249 - btree_node_topology_bad_max_key, 250 - "btree node with incorrect max_key at btree %s level %u:\n" 251 - " node %s\n" 252 - " next %s", 253 - bch2_btree_id_str(b->c.btree_id), b->c.level, 254 - buf1.buf, buf2.buf)) 255 - ret = set_node_max(c, prev, 256 - bpos_predecessor(cur->data->min_key)); 257 - } else { 258 - /* prev overwrites cur: */ 259 - 260 - if (mustfix_fsck_err_on(bpos_ge(expected_start, 261 - cur->data->max_key), c, 262 - btree_node_topology_overwritten_by_prev_node, 263 - "btree node overwritten by prev node at btree %s level %u:\n" 264 - " prev %s\n" 265 - " node %s", 266 - bch2_btree_id_str(b->c.btree_id), b->c.level, 267 - buf1.buf, buf2.buf)) { 268 - ret = DROP_THIS_NODE; 269 - goto out; 233 + } else { /* overlap */ 234 + if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */ 235 + if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */ 236 + if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_next_node, 237 + "btree node overwritten by next node%s", buf.buf)) 238 + ret = DROP_PREV_NODE; 239 + } else { 240 + if (mustfix_fsck_err(c, btree_node_topology_bad_max_key, 241 + "btree node with incorrect max_key%s", buf.buf)) 242 + ret = set_node_max(c, prev, 243 + bpos_predecessor(cur->data->min_key)); 244 + } 245 + } else { 246 + if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? 
*/ 247 + if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_prev_node, 248 + "btree node overwritten by prev node%s", buf.buf)) 249 + ret = DROP_THIS_NODE; 250 + } else { 251 + if (mustfix_fsck_err(c, btree_node_topology_bad_min_key, 252 + "btree node with incorrect min_key%s", buf.buf)) 253 + ret = set_node_min(c, cur, expected_start); 254 + } 270 255 } 271 - 272 - if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c, 273 - btree_node_topology_bad_min_key, 274 - "btree node with incorrect min_key at btree %s level %u:\n" 275 - " prev %s\n" 276 - " node %s", 277 - bch2_btree_id_str(b->c.btree_id), b->c.level, 278 - buf1.buf, buf2.buf)) 279 - ret = set_node_min(c, cur, expected_start); 280 256 } 281 - out: 257 + err: 282 258 fsck_err: 283 - printbuf_exit(&buf2); 284 - printbuf_exit(&buf1); 259 + printbuf_exit(&buf); 285 260 return ret; 286 261 } 287 262 288 263 static int btree_repair_node_end(struct bch_fs *c, struct btree *b, 289 - struct btree *child) 264 + struct btree *child, struct bpos *pulled_from_scan) 290 265 { 291 - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; 266 + struct printbuf buf = PRINTBUF; 292 267 int ret = 0; 293 268 294 - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key)); 295 - bch2_bpos_to_text(&buf2, b->key.k.p); 269 + if (bpos_eq(child->key.k.p, b->key.k.p)) 270 + return 0; 296 271 297 - if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c, 298 - btree_node_topology_bad_max_key, 299 - "btree node with incorrect max_key at btree %s level %u:\n" 300 - " %s\n" 301 - " expected %s", 302 - bch2_btree_id_str(b->c.btree_id), b->c.level, 303 - buf1.buf, buf2.buf)) { 304 - ret = set_node_max(c, child, b->key.k.p); 305 - if (ret) 306 - goto err; 272 + prt_printf(&buf, "at btree %s level %u:\n parent: ", 273 + bch2_btree_id_str(b->c.btree_id), b->c.level); 274 + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); 275 + 276 + prt_str(&buf, "\n child: "); 277 + bch2_bkey_val_to_text(&buf, c, 
bkey_i_to_s_c(&child->key)); 278 + 279 + if (mustfix_fsck_err(c, btree_node_topology_bad_max_key, 280 + "btree node with incorrect max_key%s", buf.buf)) { 281 + if (b->c.level == 1 && 282 + bpos_lt(*pulled_from_scan, b->key.k.p)) { 283 + ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0, 284 + bpos_successor(child->key.k.p), b->key.k.p); 285 + if (ret) 286 + goto err; 287 + 288 + *pulled_from_scan = b->key.k.p; 289 + ret = DID_FILL_FROM_SCAN; 290 + } else { 291 + ret = set_node_max(c, child, b->key.k.p); 292 + } 307 293 } 308 294 err: 309 295 fsck_err: 310 - printbuf_exit(&buf2); 311 - printbuf_exit(&buf1); 296 + printbuf_exit(&buf); 312 297 return ret; 313 298 } 314 299 315 - static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b) 300 + static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b, 301 + struct bpos *pulled_from_scan) 316 302 { 317 303 struct bch_fs *c = trans->c; 318 304 struct btree_and_journal_iter iter; 319 305 struct bkey_s_c k; 320 306 struct bkey_buf prev_k, cur_k; 321 307 struct btree *prev = NULL, *cur = NULL; 322 - bool have_child, dropped_children = false; 308 + bool have_child, new_pass = false; 323 309 struct printbuf buf = PRINTBUF; 324 310 int ret = 0; 325 311 326 312 if (!b->c.level) 327 313 return 0; 328 - again: 329 - prev = NULL; 330 - have_child = dropped_children = false; 314 + 331 315 bch2_bkey_buf_init(&prev_k); 332 316 bch2_bkey_buf_init(&cur_k); 317 + again: 318 + cur = prev = NULL; 319 + have_child = new_pass = false; 333 320 bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); 334 321 iter.prefetch = true; 335 322 ··· 367 332 b->c.level - 1, 368 333 buf.buf)) { 369 334 bch2_btree_node_evict(trans, cur_k.k); 370 - ret = bch2_journal_key_delete(c, b->c.btree_id, 371 - b->c.level, cur_k.k->k.p); 372 335 cur = NULL; 336 + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: 337 + bch2_journal_key_delete(c, b->c.btree_id, 338 + 
b->c.level, cur_k.k->k.p); 373 339 if (ret) 374 340 break; 375 341 continue; ··· 380 344 if (ret) 381 345 break; 382 346 383 - ret = btree_repair_node_boundaries(c, b, prev, cur); 347 + if (bch2_btree_node_is_stale(c, cur)) { 348 + bch_info(c, "btree node %s older than nodes found by scanning", buf.buf); 349 + six_unlock_read(&cur->c.lock); 350 + bch2_btree_node_evict(trans, cur_k.k); 351 + ret = bch2_journal_key_delete(c, b->c.btree_id, 352 + b->c.level, cur_k.k->k.p); 353 + cur = NULL; 354 + if (ret) 355 + break; 356 + continue; 357 + } 358 + 359 + ret = btree_check_node_boundaries(c, b, prev, cur, pulled_from_scan); 360 + if (ret == DID_FILL_FROM_SCAN) { 361 + new_pass = true; 362 + ret = 0; 363 + } 384 364 385 365 if (ret == DROP_THIS_NODE) { 386 366 six_unlock_read(&cur->c.lock); ··· 422 370 break; 423 371 424 372 bch2_btree_and_journal_iter_exit(&iter); 425 - bch2_bkey_buf_exit(&prev_k, c); 426 - bch2_bkey_buf_exit(&cur_k, c); 427 373 goto again; 428 374 } else if (ret) 429 375 break; ··· 433 383 434 384 if (!ret && !IS_ERR_OR_NULL(prev)) { 435 385 BUG_ON(cur); 436 - ret = btree_repair_node_end(c, b, prev); 386 + ret = btree_repair_node_end(c, b, prev, pulled_from_scan); 387 + if (ret == DID_FILL_FROM_SCAN) { 388 + new_pass = true; 389 + ret = 0; 390 + } 437 391 } 438 392 439 393 if (!IS_ERR_OR_NULL(prev)) ··· 451 397 goto err; 452 398 453 399 bch2_btree_and_journal_iter_exit(&iter); 400 + 401 + if (new_pass) 402 + goto again; 403 + 454 404 bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); 455 405 iter.prefetch = true; 456 406 ··· 471 413 if (ret) 472 414 goto err; 473 415 474 - ret = bch2_btree_repair_topology_recurse(trans, cur); 416 + ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan); 475 417 six_unlock_read(&cur->c.lock); 476 418 cur = NULL; 477 419 ··· 479 421 bch2_btree_node_evict(trans, cur_k.k); 480 422 ret = bch2_journal_key_delete(c, b->c.btree_id, 481 423 b->c.level, cur_k.k->k.p); 482 - dropped_children = true; 424 
+ new_pass = true; 483 425 } 484 426 485 427 if (ret) ··· 506 448 six_unlock_read(&cur->c.lock); 507 449 508 450 bch2_btree_and_journal_iter_exit(&iter); 509 - bch2_bkey_buf_exit(&prev_k, c); 510 - bch2_bkey_buf_exit(&cur_k, c); 511 451 512 - if (!ret && dropped_children) 452 + if (!ret && new_pass) 513 453 goto again; 514 454 455 + BUG_ON(!ret && bch2_btree_node_check_topology(trans, b)); 456 + 457 + bch2_bkey_buf_exit(&prev_k, c); 458 + bch2_bkey_buf_exit(&cur_k, c); 515 459 printbuf_exit(&buf); 516 460 return ret; 517 461 } ··· 521 461 int bch2_check_topology(struct bch_fs *c) 522 462 { 523 463 struct btree_trans *trans = bch2_trans_get(c); 524 - struct btree *b; 525 - unsigned i; 464 + struct bpos pulled_from_scan = POS_MIN; 526 465 int ret = 0; 527 466 528 - for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { 467 + for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { 529 468 struct btree_root *r = bch2_btree_id_root(c, i); 469 + bool reconstructed_root = false; 530 470 531 - if (!r->alive) 532 - continue; 471 + if (r->error) { 472 + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); 473 + if (ret) 474 + break; 475 + reconstruct_root: 476 + bch_info(c, "btree root %s unreadable, must recover from scan", bch2_btree_id_str(i)); 533 477 534 - b = r->b; 535 - if (btree_node_fake(b)) 536 - continue; 478 + r->alive = false; 479 + r->error = 0; 480 + 481 + if (!bch2_btree_has_scanned_nodes(c, i)) { 482 + mustfix_fsck_err(c, btree_root_unreadable_and_scan_found_nothing, 483 + "no nodes found for btree %s, continue?", bch2_btree_id_str(i)); 484 + bch2_btree_root_alloc_fake(c, i, 0); 485 + } else { 486 + bch2_btree_root_alloc_fake(c, i, 1); 487 + ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX); 488 + if (ret) 489 + break; 490 + } 491 + 492 + bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); 493 + reconstructed_root = true; 494 + } 495 + 496 + struct btree *b = r->b; 537 497 538 498 
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); 539 - ret = bch2_btree_repair_topology_recurse(trans, b); 499 + ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan); 540 500 six_unlock_read(&b->c.lock); 541 501 542 502 if (ret == DROP_THIS_NODE) { 543 - bch_err(c, "empty btree root - repair unimplemented"); 544 - ret = -BCH_ERR_fsck_repair_unimplemented; 503 + bch2_btree_node_hash_remove(&c->btree_cache, b); 504 + mutex_lock(&c->btree_cache.lock); 505 + list_move(&b->list, &c->btree_cache.freeable); 506 + mutex_unlock(&c->btree_cache.lock); 507 + 508 + r->b = NULL; 509 + 510 + if (!reconstructed_root) 511 + goto reconstruct_root; 512 + 513 + bch_err(c, "empty btree root %s", bch2_btree_id_str(i)); 514 + bch2_btree_root_alloc_fake(c, i, 0); 515 + r->alive = false; 516 + ret = 0; 545 517 } 546 518 } 547 - 519 + fsck_err: 548 520 bch2_trans_put(trans); 549 - 550 521 return ret; 551 522 } 552 523 ··· 1021 930 int ret = 0; 1022 931 1023 932 b = bch2_btree_id_root(c, btree_id)->b; 1024 - 1025 - if (btree_node_fake(b)) 1026 - return 0; 1027 933 1028 934 six_lock_read(&b->c.lock, NULL, NULL); 1029 935 printbuf_reset(&buf);
+10 -5
fs/bcachefs/btree_io.c
··· 1264 1264 return retry_read; 1265 1265 fsck_err: 1266 1266 if (ret == -BCH_ERR_btree_node_read_err_want_retry || 1267 - ret == -BCH_ERR_btree_node_read_err_must_retry) 1267 + ret == -BCH_ERR_btree_node_read_err_must_retry) { 1268 1268 retry_read = 1; 1269 - else 1269 + } else { 1270 1270 set_btree_node_read_error(b); 1271 + bch2_btree_lost_data(c, b->c.btree_id); 1272 + } 1271 1273 goto out; 1272 1274 } 1273 1275 ··· 1330 1328 1331 1329 if (!can_retry) { 1332 1330 set_btree_node_read_error(b); 1331 + bch2_btree_lost_data(c, b->c.btree_id); 1333 1332 break; 1334 1333 } 1335 1334 } ··· 1530 1527 ret = -1; 1531 1528 } 1532 1529 1533 - if (ret) 1530 + if (ret) { 1534 1531 set_btree_node_read_error(b); 1535 - else if (*saw_error) 1532 + bch2_btree_lost_data(c, b->c.btree_id); 1533 + } else if (*saw_error) 1536 1534 bch2_btree_node_rewrite_async(c, b); 1537 1535 1538 1536 for (i = 0; i < ra->nr; i++) { ··· 1669 1665 bch2_fatal_error(c); 1670 1666 1671 1667 set_btree_node_read_error(b); 1668 + bch2_btree_lost_data(c, b->c.btree_id); 1672 1669 clear_btree_node_read_in_flight(b); 1673 1670 wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); 1674 1671 printbuf_exit(&buf); ··· 1866 1861 } else { 1867 1862 ret = bch2_trans_do(c, NULL, NULL, 0, 1868 1863 bch2_btree_node_update_key_get_iter(trans, b, &wbio->key, 1869 - BCH_WATERMARK_reclaim| 1864 + BCH_WATERMARK_interior_updates| 1870 1865 BCH_TRANS_COMMIT_journal_reclaim| 1871 1866 BCH_TRANS_COMMIT_no_enospc| 1872 1867 BCH_TRANS_COMMIT_no_check_rw,
+19
fs/bcachefs/btree_journal_iter.c
··· 567 567 bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr); 568 568 return 0; 569 569 } 570 + 571 + void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree, 572 + unsigned level_min, unsigned level_max, 573 + struct bpos start, struct bpos end) 574 + { 575 + struct journal_keys *keys = &c->journal_keys; 576 + size_t dst = 0; 577 + 578 + move_gap(keys, keys->nr); 579 + 580 + darray_for_each(*keys, i) 581 + if (!(i->btree_id == btree && 582 + i->level >= level_min && 583 + i->level <= level_max && 584 + bpos_ge(i->k->k.p, start) && 585 + bpos_le(i->k->k.p, end))) 586 + keys->data[dst++] = *i; 587 + keys->nr = keys->gap = dst; 588 + }
+4
fs/bcachefs/btree_journal_iter.h
··· 66 66 67 67 int bch2_journal_keys_sort(struct bch_fs *); 68 68 69 + void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id, 70 + unsigned, unsigned, 71 + struct bpos, struct bpos); 72 + 69 73 #endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
+495
fs/bcachefs/btree_node_scan.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include "bcachefs.h" 4 + #include "btree_cache.h" 5 + #include "btree_io.h" 6 + #include "btree_journal_iter.h" 7 + #include "btree_node_scan.h" 8 + #include "btree_update_interior.h" 9 + #include "buckets.h" 10 + #include "error.h" 11 + #include "journal_io.h" 12 + #include "recovery_passes.h" 13 + 14 + #include <linux/kthread.h> 15 + #include <linux/sort.h> 16 + 17 + struct find_btree_nodes_worker { 18 + struct closure *cl; 19 + struct find_btree_nodes *f; 20 + struct bch_dev *ca; 21 + }; 22 + 23 + static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n) 24 + { 25 + prt_printf(out, "%s l=%u seq=%u cookie=%llx ", bch2_btree_id_str(n->btree_id), n->level, n->seq, n->cookie); 26 + bch2_bpos_to_text(out, n->min_key); 27 + prt_str(out, "-"); 28 + bch2_bpos_to_text(out, n->max_key); 29 + 30 + if (n->range_updated) 31 + prt_str(out, " range updated"); 32 + if (n->overwritten) 33 + prt_str(out, " overwritten"); 34 + 35 + for (unsigned i = 0; i < n->nr_ptrs; i++) { 36 + prt_char(out, ' '); 37 + bch2_extent_ptr_to_text(out, c, n->ptrs + i); 38 + } 39 + } 40 + 41 + static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes) 42 + { 43 + printbuf_indent_add(out, 2); 44 + darray_for_each(nodes, i) { 45 + found_btree_node_to_text(out, c, i); 46 + prt_newline(out); 47 + } 48 + printbuf_indent_sub(out, 2); 49 + } 50 + 51 + static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f) 52 + { 53 + struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k); 54 + 55 + set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs); 56 + bp->k.p = f->max_key; 57 + bp->v.seq = cpu_to_le64(f->cookie); 58 + bp->v.sectors_written = 0; 59 + bp->v.flags = 0; 60 + bp->v.min_key = f->min_key; 61 + SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated); 62 + memcpy(bp->v.start, f->ptrs, sizeof(struct 
bch_extent_ptr) * f->nr_ptrs); 63 + } 64 + 65 + static bool found_btree_node_is_readable(struct btree_trans *trans, 66 + const struct found_btree_node *f) 67 + { 68 + struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } k; 69 + 70 + found_btree_node_to_key(&k.k, f); 71 + 72 + struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false); 73 + bool ret = !IS_ERR_OR_NULL(b); 74 + if (ret) 75 + six_unlock_read(&b->c.lock); 76 + 77 + /* 78 + * We might update this node's range; if that happens, we need the node 79 + * to be re-read so the read path can trim keys that are no longer in 80 + * this node 81 + */ 82 + if (b != btree_node_root(trans->c, b)) 83 + bch2_btree_node_evict(trans, &k.k); 84 + return ret; 85 + } 86 + 87 + static int found_btree_node_cmp_cookie(const void *_l, const void *_r) 88 + { 89 + const struct found_btree_node *l = _l; 90 + const struct found_btree_node *r = _r; 91 + 92 + return cmp_int(l->btree_id, r->btree_id) ?: 93 + cmp_int(l->level, r->level) ?: 94 + cmp_int(l->cookie, r->cookie); 95 + } 96 + 97 + /* 98 + * Given two found btree nodes, if their sequence numbers are equal, take the 99 + * one that's readable: 100 + */ 101 + static int found_btree_node_cmp_time(const struct found_btree_node *l, 102 + const struct found_btree_node *r) 103 + { 104 + return cmp_int(l->seq, r->seq); 105 + } 106 + 107 + static int found_btree_node_cmp_pos(const void *_l, const void *_r) 108 + { 109 + const struct found_btree_node *l = _l; 110 + const struct found_btree_node *r = _r; 111 + 112 + return cmp_int(l->btree_id, r->btree_id) ?: 113 + -cmp_int(l->level, r->level) ?: 114 + bpos_cmp(l->min_key, r->min_key) ?: 115 + -found_btree_node_cmp_time(l, r); 116 + } 117 + 118 + static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, 119 + struct bio *bio, struct btree_node *bn, u64 offset) 120 + { 121 + struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); 122 + 123 + bio_reset(bio, 
ca->disk_sb.bdev, REQ_OP_READ); 124 + bio->bi_iter.bi_sector = offset; 125 + bch2_bio_map(bio, bn, PAGE_SIZE); 126 + 127 + submit_bio_wait(bio); 128 + if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, 129 + "IO error in try_read_btree_node() at %llu: %s", 130 + offset, bch2_blk_status_to_str(bio->bi_status))) 131 + return; 132 + 133 + if (le64_to_cpu(bn->magic) != bset_magic(c)) 134 + return; 135 + 136 + rcu_read_lock(); 137 + struct found_btree_node n = { 138 + .btree_id = BTREE_NODE_ID(bn), 139 + .level = BTREE_NODE_LEVEL(bn), 140 + .seq = BTREE_NODE_SEQ(bn), 141 + .cookie = le64_to_cpu(bn->keys.seq), 142 + .min_key = bn->min_key, 143 + .max_key = bn->max_key, 144 + .nr_ptrs = 1, 145 + .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr, 146 + .ptrs[0].offset = offset, 147 + .ptrs[0].dev = ca->dev_idx, 148 + .ptrs[0].gen = *bucket_gen(ca, sector_to_bucket(ca, offset)), 149 + }; 150 + rcu_read_unlock(); 151 + 152 + if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) { 153 + mutex_lock(&f->lock); 154 + if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) { 155 + bch_err(c, "try_read_btree_node() can't handle endian conversion"); 156 + f->ret = -EINVAL; 157 + goto unlock; 158 + } 159 + 160 + if (darray_push(&f->nodes, n)) 161 + f->ret = -ENOMEM; 162 + unlock: 163 + mutex_unlock(&f->lock); 164 + } 165 + } 166 + 167 + static int read_btree_nodes_worker(void *p) 168 + { 169 + struct find_btree_nodes_worker *w = p; 170 + struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes); 171 + struct bch_dev *ca = w->ca; 172 + void *buf = (void *) __get_free_page(GFP_KERNEL); 173 + struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL); 174 + unsigned long last_print = jiffies; 175 + 176 + if (!buf || !bio) { 177 + bch_err(c, "read_btree_nodes_worker: error allocating bio/buf"); 178 + w->f->ret = -ENOMEM; 179 + goto err; 180 + } 181 + 182 + for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++) 183 + for (unsigned bucket_offset = 0; 
184 + bucket_offset + btree_sectors(c) <= ca->mi.bucket_size; 185 + bucket_offset += btree_sectors(c)) { 186 + if (time_after(jiffies, last_print + HZ * 30)) { 187 + u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset; 188 + u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size; 189 + 190 + bch_info(ca, "%s: %2u%% done", __func__, 191 + (unsigned) div64_u64(cur_sector * 100, end_sector)); 192 + last_print = jiffies; 193 + } 194 + 195 + try_read_btree_node(w->f, ca, bio, buf, 196 + bucket * ca->mi.bucket_size + bucket_offset); 197 + } 198 + err: 199 + bio_put(bio); 200 + free_page((unsigned long) buf); 201 + percpu_ref_get(&ca->io_ref); 202 + closure_put(w->cl); 203 + kfree(w); 204 + return 0; 205 + } 206 + 207 + static int read_btree_nodes(struct find_btree_nodes *f) 208 + { 209 + struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); 210 + struct closure cl; 211 + int ret = 0; 212 + 213 + closure_init_stack(&cl); 214 + 215 + for_each_online_member(c, ca) { 216 + struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); 217 + struct task_struct *t; 218 + 219 + if (!w) { 220 + percpu_ref_put(&ca->io_ref); 221 + ret = -ENOMEM; 222 + goto err; 223 + } 224 + 225 + percpu_ref_get(&ca->io_ref); 226 + closure_get(&cl); 227 + w->cl = &cl; 228 + w->f = f; 229 + w->ca = ca; 230 + 231 + t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); 232 + ret = IS_ERR_OR_NULL(t); 233 + if (ret) { 234 + percpu_ref_put(&ca->io_ref); 235 + closure_put(&cl); 236 + f->ret = ret; 237 + bch_err(c, "error starting kthread: %i", ret); 238 + break; 239 + } 240 + } 241 + err: 242 + closure_sync(&cl); 243 + return f->ret ?: ret; 244 + } 245 + 246 + static void bubble_up(struct found_btree_node *n, struct found_btree_node *end) 247 + { 248 + while (n + 1 < end && 249 + found_btree_node_cmp_pos(n, n + 1) > 0) { 250 + swap(n[0], n[1]); 251 + n++; 252 + } 253 + } 254 + 255 + static int handle_overwrites(struct bch_fs *c, 256 + struct 
found_btree_node *start, 257 + struct found_btree_node *end) 258 + { 259 + struct found_btree_node *n; 260 + again: 261 + for (n = start + 1; 262 + n < end && 263 + n->btree_id == start->btree_id && 264 + n->level == start->level && 265 + bpos_lt(n->min_key, start->max_key); 266 + n++) { 267 + int cmp = found_btree_node_cmp_time(start, n); 268 + 269 + if (cmp > 0) { 270 + if (bpos_cmp(start->max_key, n->max_key) >= 0) 271 + n->overwritten = true; 272 + else { 273 + n->range_updated = true; 274 + n->min_key = bpos_successor(start->max_key); 275 + n->range_updated = true; 276 + bubble_up(n, end); 277 + goto again; 278 + } 279 + } else if (cmp < 0) { 280 + BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0); 281 + 282 + start->max_key = bpos_predecessor(n->min_key); 283 + start->range_updated = true; 284 + } else { 285 + struct printbuf buf = PRINTBUF; 286 + 287 + prt_str(&buf, "overlapping btree nodes with same seq! halting\n "); 288 + found_btree_node_to_text(&buf, c, start); 289 + prt_str(&buf, "\n "); 290 + found_btree_node_to_text(&buf, c, n); 291 + bch_err(c, "%s", buf.buf); 292 + printbuf_exit(&buf); 293 + return -1; 294 + } 295 + } 296 + 297 + return 0; 298 + } 299 + 300 + int bch2_scan_for_btree_nodes(struct bch_fs *c) 301 + { 302 + struct find_btree_nodes *f = &c->found_btree_nodes; 303 + struct printbuf buf = PRINTBUF; 304 + size_t dst; 305 + int ret = 0; 306 + 307 + if (f->nodes.nr) 308 + return 0; 309 + 310 + mutex_init(&f->lock); 311 + 312 + ret = read_btree_nodes(f); 313 + if (ret) 314 + return ret; 315 + 316 + if (!f->nodes.nr) { 317 + bch_err(c, "%s: no btree nodes found", __func__); 318 + ret = -EINVAL; 319 + goto err; 320 + } 321 + 322 + if (0 && c->opts.verbose) { 323 + printbuf_reset(&buf); 324 + prt_printf(&buf, "%s: nodes found:\n", __func__); 325 + found_btree_nodes_to_text(&buf, c, f->nodes); 326 + bch2_print_string_as_lines(KERN_INFO, buf.buf); 327 + } 328 + 329 + sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), 
found_btree_node_cmp_cookie, NULL); 330 + 331 + dst = 0; 332 + darray_for_each(f->nodes, i) { 333 + struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL; 334 + 335 + if (prev && 336 + prev->cookie == i->cookie) { 337 + if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) { 338 + bch_err(c, "%s: found too many replicas for btree node", __func__); 339 + ret = -EINVAL; 340 + goto err; 341 + } 342 + prev->ptrs[prev->nr_ptrs++] = i->ptrs[0]; 343 + } else { 344 + f->nodes.data[dst++] = *i; 345 + } 346 + } 347 + f->nodes.nr = dst; 348 + 349 + sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); 350 + 351 + if (0 && c->opts.verbose) { 352 + printbuf_reset(&buf); 353 + prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__); 354 + found_btree_nodes_to_text(&buf, c, f->nodes); 355 + bch2_print_string_as_lines(KERN_INFO, buf.buf); 356 + } 357 + 358 + dst = 0; 359 + darray_for_each(f->nodes, i) { 360 + if (i->overwritten) 361 + continue; 362 + 363 + ret = handle_overwrites(c, i, &darray_top(f->nodes)); 364 + if (ret) 365 + goto err; 366 + 367 + BUG_ON(i->overwritten); 368 + f->nodes.data[dst++] = *i; 369 + } 370 + f->nodes.nr = dst; 371 + 372 + if (c->opts.verbose) { 373 + printbuf_reset(&buf); 374 + prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__); 375 + found_btree_nodes_to_text(&buf, c, f->nodes); 376 + bch2_print_string_as_lines(KERN_INFO, buf.buf); 377 + } 378 + 379 + eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); 380 + err: 381 + printbuf_exit(&buf); 382 + return ret; 383 + } 384 + 385 + static int found_btree_node_range_start_cmp(const void *_l, const void *_r) 386 + { 387 + const struct found_btree_node *l = _l; 388 + const struct found_btree_node *r = _r; 389 + 390 + return cmp_int(l->btree_id, r->btree_id) ?: 391 + -cmp_int(l->level, r->level) ?: 392 + bpos_cmp(l->max_key, r->min_key); 393 + } 394 + 395 + #define 
for_each_found_btree_node_in_range(_f, _search, _idx) \ 396 + for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \ 397 + sizeof((_f)->nodes.data[0]), \ 398 + found_btree_node_range_start_cmp, &search); \ 399 + _idx < (_f)->nodes.nr && \ 400 + (_f)->nodes.data[_idx].btree_id == _search.btree_id && \ 401 + (_f)->nodes.data[_idx].level == _search.level && \ 402 + bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key); \ 403 + _idx = eytzinger0_next(_idx, (_f)->nodes.nr)) 404 + 405 + bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b) 406 + { 407 + struct find_btree_nodes *f = &c->found_btree_nodes; 408 + 409 + struct found_btree_node search = { 410 + .btree_id = b->c.btree_id, 411 + .level = b->c.level, 412 + .min_key = b->data->min_key, 413 + .max_key = b->key.k.p, 414 + }; 415 + 416 + for_each_found_btree_node_in_range(f, search, idx) 417 + if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data)) 418 + return true; 419 + return false; 420 + } 421 + 422 + bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) 423 + { 424 + struct found_btree_node search = { 425 + .btree_id = btree, 426 + .level = 0, 427 + .min_key = POS_MIN, 428 + .max_key = SPOS_MAX, 429 + }; 430 + 431 + for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx) 432 + return true; 433 + return false; 434 + } 435 + 436 + int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, 437 + unsigned level, struct bpos node_min, struct bpos node_max) 438 + { 439 + struct find_btree_nodes *f = &c->found_btree_nodes; 440 + 441 + int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); 442 + if (ret) 443 + return ret; 444 + 445 + if (c->opts.verbose) { 446 + struct printbuf buf = PRINTBUF; 447 + 448 + prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level); 449 + bch2_bpos_to_text(&buf, node_min); 450 + prt_str(&buf, " - "); 451 + bch2_bpos_to_text(&buf, node_max); 452 + 453 + bch_info(c, "%s(): 
%s", __func__, buf.buf); 454 + printbuf_exit(&buf); 455 + } 456 + 457 + struct found_btree_node search = { 458 + .btree_id = btree, 459 + .level = level, 460 + .min_key = node_min, 461 + .max_key = node_max, 462 + }; 463 + 464 + for_each_found_btree_node_in_range(f, search, idx) { 465 + struct found_btree_node n = f->nodes.data[idx]; 466 + 467 + n.range_updated |= bpos_lt(n.min_key, node_min); 468 + n.min_key = bpos_max(n.min_key, node_min); 469 + 470 + n.range_updated |= bpos_gt(n.max_key, node_max); 471 + n.max_key = bpos_min(n.max_key, node_max); 472 + 473 + struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp; 474 + 475 + found_btree_node_to_key(&tmp.k, &n); 476 + 477 + struct printbuf buf = PRINTBUF; 478 + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k)); 479 + bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); 480 + printbuf_exit(&buf); 481 + 482 + BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL)); 483 + 484 + ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k); 485 + if (ret) 486 + return ret; 487 + } 488 + 489 + return 0; 490 + } 491 + 492 + void bch2_find_btree_nodes_exit(struct find_btree_nodes *f) 493 + { 494 + darray_exit(&f->nodes); 495 + }
+11
fs/bcachefs/btree_node_scan.h
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_NODE_SCAN_H
#define _BCACHEFS_BTREE_NODE_SCAN_H

/* Scan all online devices for btree nodes, filling c->found_btree_nodes */
int bch2_scan_for_btree_nodes(struct bch_fs *);

/* True if the scan found a newer version (higher seq) of this node */
bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *);

/* True if the scan found any leaf (level 0) nodes for the given btree */
bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id);

/*
 * Fill a hole [node_min, node_max] at the given level of the given btree by
 * inserting pointers to found nodes into the journal keys at level + 1
 */
int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos);

/* Free scan results */
void bch2_find_btree_nodes_exit(struct find_btree_nodes *);

#endif /* _BCACHEFS_BTREE_NODE_SCAN_H */
+30
fs/bcachefs/btree_node_scan_types.h
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
#define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H

#include "darray.h"

/* A btree node discovered by scanning a device (see btree_node_scan.c) */
struct found_btree_node {
	bool			range_updated:1;	/* min/max_key adjusted after discovery */
	bool			overwritten:1;		/* fully superseded by a newer node */
	u8			btree_id;
	u8			level;
	u32			seq;		/* node write seq; newer wins on overlapping ranges */
	u64			cookie;		/* bset seq: identifies replicas of the same node */

	struct bpos		min_key;
	struct bpos		max_key;

	/* replicas of this node, merged by matching cookie during the scan: */
	unsigned		nr_ptrs;
	struct bch_extent_ptr	ptrs[BCH_REPLICAS_MAX];
};

typedef DARRAY(struct found_btree_node) found_btree_nodes;

struct find_btree_nodes {
	int			ret;	/* error recorded by scan workers */
	struct mutex		lock;	/* guards nodes against concurrent per-device workers */
	found_btree_nodes	nodes;
};

#endif /* _BCACHEFS_BTREE_NODE_SCAN_TYPES_H */
+2 -1
fs/bcachefs/btree_trans_commit.c
··· 887 887 int ret, unsigned long trace_ip) 888 888 { 889 889 struct bch_fs *c = trans->c; 890 + enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; 890 891 891 892 switch (ret) { 892 893 case -BCH_ERR_btree_insert_btree_node_full: ··· 906 905 * flag 907 906 */ 908 907 if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && 909 - (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { 908 + watermark < BCH_WATERMARK_reclaim) { 910 909 ret = -BCH_ERR_journal_reclaim_would_deadlock; 911 910 break; 912 911 }
+36 -21
fs/bcachefs/btree_update_interior.c
··· 26 26 27 27 #include <linux/random.h> 28 28 29 + const char * const bch2_btree_update_modes[] = { 30 + #define x(t) #t, 31 + BCH_WATERMARKS() 32 + #undef x 33 + NULL 34 + }; 35 + 29 36 static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, 30 37 btree_path_idx_t, struct btree *, struct keylist *); 31 38 static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); ··· 310 303 struct open_buckets obs = { .nr = 0 }; 311 304 struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; 312 305 enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; 313 - unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim 306 + unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim 314 307 ? BTREE_NODE_RESERVE 315 308 : 0; 316 309 int ret; ··· 694 687 * which may require allocations as well. 695 688 */ 696 689 ret = commit_do(trans, &as->disk_res, &journal_seq, 697 - BCH_WATERMARK_reclaim| 690 + BCH_WATERMARK_interior_updates| 698 691 BCH_TRANS_COMMIT_no_enospc| 699 692 BCH_TRANS_COMMIT_no_check_rw| 700 693 BCH_TRANS_COMMIT_journal_reclaim, ··· 853 846 mutex_lock(&c->btree_interior_update_lock); 854 847 list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); 855 848 856 - BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); 849 + BUG_ON(as->mode != BTREE_UPDATE_none); 857 850 BUG_ON(!btree_node_dirty(b)); 858 851 BUG_ON(!b->c.level); 859 852 860 - as->mode = BTREE_INTERIOR_UPDATING_NODE; 853 + as->mode = BTREE_UPDATE_node; 861 854 as->b = b; 862 855 863 856 set_btree_node_write_blocked(b); ··· 880 873 lockdep_assert_held(&c->btree_interior_update_lock); 881 874 882 875 child->b = NULL; 883 - child->mode = BTREE_INTERIOR_UPDATING_AS; 876 + child->mode = BTREE_UPDATE_update; 884 877 885 878 bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, 886 879 bch2_update_reparent_journal_pin_flush); ··· 891 884 struct bkey_i *insert = &b->key; 892 885 struct bch_fs *c = as->c; 893 886 894 - BUG_ON(as->mode != 
BTREE_INTERIOR_NO_UPDATE); 887 + BUG_ON(as->mode != BTREE_UPDATE_none); 895 888 896 889 BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > 897 890 ARRAY_SIZE(as->journal_entries)); ··· 905 898 mutex_lock(&c->btree_interior_update_lock); 906 899 list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); 907 900 908 - as->mode = BTREE_INTERIOR_UPDATING_ROOT; 901 + as->mode = BTREE_UPDATE_root; 909 902 mutex_unlock(&c->btree_interior_update_lock); 910 903 } 911 904 ··· 1083 1076 struct bch_fs *c = as->c; 1084 1077 u64 start_time = as->start_time; 1085 1078 1086 - BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); 1079 + BUG_ON(as->mode == BTREE_UPDATE_none); 1087 1080 1088 1081 if (as->took_gc_lock) 1089 1082 up_read(&as->c->gc_lock); ··· 1128 1121 unsigned journal_flags = watermark|JOURNAL_RES_GET_CHECK; 1129 1122 1130 1123 if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && 1131 - watermark != BCH_WATERMARK_reclaim) 1124 + watermark < BCH_WATERMARK_reclaim) 1132 1125 journal_flags |= JOURNAL_RES_GET_NONBLOCK; 1133 1126 1134 1127 ret = drop_locks_do(trans, ··· 1179 1172 as->c = c; 1180 1173 as->start_time = start_time; 1181 1174 as->ip_started = _RET_IP_; 1182 - as->mode = BTREE_INTERIOR_NO_UPDATE; 1175 + as->mode = BTREE_UPDATE_none; 1176 + as->watermark = watermark; 1183 1177 as->took_gc_lock = true; 1184 1178 as->btree_id = path->btree_id; 1185 1179 as->update_level = update_level; ··· 1225 1217 */ 1226 1218 if (bch2_err_matches(ret, ENOSPC) && 1227 1219 (flags & BCH_TRANS_COMMIT_journal_reclaim) && 1228 - watermark != BCH_WATERMARK_reclaim) { 1220 + watermark < BCH_WATERMARK_reclaim) { 1229 1221 ret = -BCH_ERR_journal_reclaim_would_deadlock; 1230 1222 goto err; 1231 1223 } ··· 2466 2458 bch2_btree_set_root_inmem(c, b); 2467 2459 } 2468 2460 2469 - static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) 2461 + static int __bch2_btree_root_alloc_fake(struct btree_trans *trans, enum btree_id id, unsigned level) 2470 2462 { 2471 
2463 struct bch_fs *c = trans->c; 2472 2464 struct closure cl; ··· 2485 2477 2486 2478 set_btree_node_fake(b); 2487 2479 set_btree_node_need_rewrite(b); 2488 - b->c.level = 0; 2480 + b->c.level = level; 2489 2481 b->c.btree_id = id; 2490 2482 2491 2483 bkey_btree_ptr_init(&b->key); ··· 2512 2504 return 0; 2513 2505 } 2514 2506 2515 - void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) 2507 + void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level) 2516 2508 { 2517 - bch2_trans_run(c, __bch2_btree_root_alloc(trans, id)); 2509 + bch2_trans_run(c, __bch2_btree_root_alloc_fake(trans, id, level)); 2510 + } 2511 + 2512 + static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) 2513 + { 2514 + prt_printf(out, "%ps: btree=%s watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", 2515 + (void *) as->ip_started, 2516 + bch2_btree_id_str(as->btree_id), 2517 + bch2_watermarks[as->watermark], 2518 + bch2_btree_update_modes[as->mode], 2519 + as->nodes_written, 2520 + closure_nr_remaining(&as->cl), 2521 + as->journal.seq); 2518 2522 } 2519 2523 2520 2524 void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) ··· 2535 2515 2536 2516 mutex_lock(&c->btree_interior_update_lock); 2537 2517 list_for_each_entry(as, &c->btree_interior_update_list, list) 2538 - prt_printf(out, "%ps: mode=%u nodes_written=%u cl.remaining=%u journal_seq=%llu\n", 2539 - (void *) as->ip_started, 2540 - as->mode, 2541 - as->nodes_written, 2542 - closure_nr_remaining(&as->cl), 2543 - as->journal.seq); 2518 + bch2_btree_update_to_text(out, as); 2544 2519 mutex_unlock(&c->btree_interior_update_lock); 2545 2520 } 2546 2521
+16 -10
fs/bcachefs/btree_update_interior.h
··· 12 12 13 13 int bch2_btree_node_check_topology(struct btree_trans *, struct btree *); 14 14 15 + #define BTREE_UPDATE_MODES() \ 16 + x(none) \ 17 + x(node) \ 18 + x(root) \ 19 + x(update) 20 + 21 + enum btree_update_mode { 22 + #define x(n) BTREE_UPDATE_##n, 23 + BTREE_UPDATE_MODES() 24 + #undef x 25 + }; 26 + 15 27 /* 16 28 * Tracks an in progress split/rewrite of a btree node and the update to the 17 29 * parent node: ··· 51 39 struct list_head list; 52 40 struct list_head unwritten_list; 53 41 54 - /* What kind of update are we doing? */ 55 - enum { 56 - BTREE_INTERIOR_NO_UPDATE, 57 - BTREE_INTERIOR_UPDATING_NODE, 58 - BTREE_INTERIOR_UPDATING_ROOT, 59 - BTREE_INTERIOR_UPDATING_AS, 60 - } mode; 61 - 42 + enum btree_update_mode mode; 43 + enum bch_watermark watermark; 62 44 unsigned nodes_written:1; 63 45 unsigned took_gc_lock:1; 64 46 ··· 62 56 struct disk_reservation disk_res; 63 57 64 58 /* 65 - * BTREE_INTERIOR_UPDATING_NODE: 59 + * BTREE_UPDATE_node: 66 60 * The update that made the new nodes visible was a regular update to an 67 61 * existing interior node - @b. We can't write out the update to @b 68 62 * until the new nodes we created are finished writing, so we block @b ··· 171 165 struct bkey_i *, unsigned, bool); 172 166 173 167 void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); 174 - void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); 168 + void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned); 175 169 176 170 static inline unsigned btree_update_reserve_required(struct bch_fs *c, 177 171 struct btree *b)
+1
fs/bcachefs/buckets.h
··· 226 226 fallthrough; 227 227 case BCH_WATERMARK_btree_copygc: 228 228 case BCH_WATERMARK_reclaim: 229 + case BCH_WATERMARK_interior_updates: 229 230 break; 230 231 } 231 232
+1 -2
fs/bcachefs/data_update.c
··· 580 580 move_ctxt_wait_event(ctxt, 581 581 (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, 582 582 PTR_BUCKET_POS(c, &p.ptr), 0)) || 583 - (!atomic_read(&ctxt->read_sectors) && 584 - !atomic_read(&ctxt->write_sectors))); 583 + list_empty(&ctxt->ios)); 585 584 586 585 if (!locked) 587 586 bch2_bucket_nocow_lock(&c->nocow_locks,
+28 -24
fs/bcachefs/extents.c
··· 978 978 return bkey_deleted(k.k); 979 979 } 980 980 981 + void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr) 982 + { 983 + struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] 984 + ? bch_dev_bkey_exists(c, ptr->dev) 985 + : NULL; 986 + 987 + if (!ca) { 988 + prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, 989 + (u64) ptr->offset, ptr->gen, 990 + ptr->cached ? " cached" : ""); 991 + } else { 992 + u32 offset; 993 + u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); 994 + 995 + prt_printf(out, "ptr: %u:%llu:%u gen %u", 996 + ptr->dev, b, offset, ptr->gen); 997 + if (ptr->cached) 998 + prt_str(out, " cached"); 999 + if (ptr->unwritten) 1000 + prt_str(out, " unwritten"); 1001 + if (ca && ptr_stale(ca, ptr)) 1002 + prt_printf(out, " stale"); 1003 + } 1004 + } 1005 + 981 1006 void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 982 1007 struct bkey_s_c k) 983 1008 { ··· 1018 993 prt_printf(out, " "); 1019 994 1020 995 switch (__extent_entry_type(entry)) { 1021 - case BCH_EXTENT_ENTRY_ptr: { 1022 - const struct bch_extent_ptr *ptr = entry_to_ptr(entry); 1023 - struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] 1024 - ? bch_dev_bkey_exists(c, ptr->dev) 1025 - : NULL; 1026 - 1027 - if (!ca) { 1028 - prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, 1029 - (u64) ptr->offset, ptr->gen, 1030 - ptr->cached ? 
" cached" : ""); 1031 - } else { 1032 - u32 offset; 1033 - u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); 1034 - 1035 - prt_printf(out, "ptr: %u:%llu:%u gen %u", 1036 - ptr->dev, b, offset, ptr->gen); 1037 - if (ptr->cached) 1038 - prt_str(out, " cached"); 1039 - if (ptr->unwritten) 1040 - prt_str(out, " unwritten"); 1041 - if (ca && ptr_stale(ca, ptr)) 1042 - prt_printf(out, " stale"); 1043 - } 996 + case BCH_EXTENT_ENTRY_ptr: 997 + bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry)); 1044 998 break; 1045 - } 999 + 1046 1000 case BCH_EXTENT_ENTRY_crc32: 1047 1001 case BCH_EXTENT_ENTRY_crc64: 1048 1002 case BCH_EXTENT_ENTRY_crc128: {
+1
fs/bcachefs/extents.h
··· 676 676 void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *); 677 677 678 678 bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); 679 + void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *); 679 680 void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, 680 681 struct bkey_s_c); 681 682 int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c,
+234
fs/bcachefs/eytzinger.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Heapsort over arrays in eytzinger (BFS) layout.
 *
 * NOTE(review): the alignment/swap helpers below appear adapted from
 * lib/sort.c — confirm provenance before diverging from that code.
 */

#include "eytzinger.h"

/**
 * is_aligned - is this pointer & size okay for word-wide copying?
 * @base: pointer to data
 * @size: size of each element
 * @align: required alignment (typically 4 or 8)
 *
 * Returns true if elements can be copied using word loads and stores.
 * The size must be a multiple of the alignment, and the base address must
 * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
 *
 * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
 * to "if ((a | b) & mask)", so we do that by hand.
 */
__attribute_const__ __always_inline
static bool is_aligned(const void *base, size_t size, unsigned char align)
{
	unsigned char lsbits = (unsigned char)size;

	(void)base;
#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	lsbits |= (unsigned char)(uintptr_t)base;
#endif
	return (lsbits & (align - 1)) == 0;
}

/**
 * swap_words_32 - swap two elements in 32-bit chunks
 * @a: pointer to the first element to swap
 * @b: pointer to the second element to swap
 * @n: element size (must be a multiple of 4)
 *
 * Exchange the two objects in memory. This exploits base+index addressing,
 * which basically all CPUs have, to minimize loop overhead computations.
 *
 * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
 * bottom of the loop, even though the zero flag is still valid from the
 * subtract (since the intervening mov instructions don't alter the flags).
 * Gcc 8.1.0 doesn't have that problem.
 */
static void swap_words_32(void *a, void *b, size_t n)
{
	do {
		u32 t = *(u32 *)(a + (n -= 4));
		*(u32 *)(a + n) = *(u32 *)(b + n);
		*(u32 *)(b + n) = t;
	} while (n);
}

/**
 * swap_words_64 - swap two elements in 64-bit chunks
 * @a: pointer to the first element to swap
 * @b: pointer to the second element to swap
 * @n: element size (must be a multiple of 8)
 *
 * Exchange the two objects in memory. This exploits base+index
 * addressing, which basically all CPUs have, to minimize loop overhead
 * computations.
 *
 * We'd like to use 64-bit loads if possible. If they're not, emulating
 * one requires base+index+4 addressing which x86 has but most other
 * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads,
 * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
 * x32 ABI). Are there any cases the kernel needs to worry about?
 */
static void swap_words_64(void *a, void *b, size_t n)
{
	do {
#ifdef CONFIG_64BIT
		u64 t = *(u64 *)(a + (n -= 8));
		*(u64 *)(a + n) = *(u64 *)(b + n);
		*(u64 *)(b + n) = t;
#else
		/* Use two 32-bit transfers to avoid base+index+4 addressing */
		u32 t = *(u32 *)(a + (n -= 4));
		*(u32 *)(a + n) = *(u32 *)(b + n);
		*(u32 *)(b + n) = t;

		t = *(u32 *)(a + (n -= 4));
		*(u32 *)(a + n) = *(u32 *)(b + n);
		*(u32 *)(b + n) = t;
#endif
	} while (n);
}

/**
 * swap_bytes - swap two elements a byte at a time
 * @a: pointer to the first element to swap
 * @b: pointer to the second element to swap
 * @n: element size
 *
 * This is the fallback if alignment doesn't allow using larger chunks.
 */
static void swap_bytes(void *a, void *b, size_t n)
{
	do {
		char t = ((char *)a)[--n];
		((char *)a)[n] = ((char *)b)[n];
		((char *)b)[n] = t;
	} while (n);
}

/*
 * The values are arbitrary as long as they can't be confused with
 * a pointer, but small integers make for the smallest compare
 * instructions.
 */
#define SWAP_WORDS_64 (swap_r_func_t)0
#define SWAP_WORDS_32 (swap_r_func_t)1
#define SWAP_BYTES    (swap_r_func_t)2
#define SWAP_WRAPPER  (swap_r_func_t)3

/* Bundles plain (non-reentrant) cmp/swap callbacks behind the _r interface */
struct wrapper {
	cmp_func_t cmp;
	swap_func_t swap;
};

/*
 * The function pointer is last to make tail calls most efficient if the
 * compiler decides not to inline this function.
 */
static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
{
	if (swap_func == SWAP_WRAPPER) {
		((const struct wrapper *)priv)->swap(a, b, (int)size);
		return;
	}

	if (swap_func == SWAP_WORDS_64)
		swap_words_64(a, b, size);
	else if (swap_func == SWAP_WORDS_32)
		swap_words_32(a, b, size);
	else if (swap_func == SWAP_BYTES)
		swap_bytes(a, b, size);
	else
		swap_func(a, b, (int)size, priv);
}

/* Sentinel: cmp callback is the plain cmp_func_t inside struct wrapper */
#define _CMP_WRAPPER ((cmp_r_func_t)0L)

static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv)
{
	if (cmp == _CMP_WRAPPER)
		return ((const struct wrapper *)priv)->cmp(a, b);
	return cmp(a, b, priv);
}

/* Compare elements at *inorder* indices l/r, mapped through the eytzinger layout */
static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size,
			 cmp_r_func_t cmp_func, const void *priv,
			 size_t l, size_t r)
{
	return do_cmp(base + inorder_to_eytzinger0(l, n) * size,
		      base + inorder_to_eytzinger0(r, n) * size,
		      cmp_func, priv);
}

/* Swap elements at *inorder* indices l/r, mapped through the eytzinger layout */
static inline void eytzinger0_do_swap(void *base, size_t n, size_t size,
			   swap_r_func_t swap_func, const void *priv,
			   size_t l, size_t r)
{
	do_swap(base + inorder_to_eytzinger0(l, n) * size,
		base + inorder_to_eytzinger0(r, n) * size,
		size, swap_func, priv);
}

void eytzinger0_sort_r(void *base, size_t n, size_t size,
		       cmp_r_func_t cmp_func,
		       swap_r_func_t swap_func,
		       const void *priv)
{
	int i, c, r;

	/* called from 'sort' without swap function, let's pick the default */
	if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap)
		swap_func = NULL;

	if (!swap_func) {
		if (is_aligned(base, size, 8))
			swap_func = SWAP_WORDS_64;
		else if (is_aligned(base, size, 4))
			swap_func = SWAP_WORDS_32;
		else
			swap_func = SWAP_BYTES;
	}

	/* heapify */
	for (i = n / 2 - 1; i >= 0; --i) {
		for (r = i; r * 2 + 1 < n; r = c) {
			c = r * 2 + 1;

			if (c + 1 < n &&
			    eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
				c++;

			if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
				break;

			eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
		}
	}

	/* sort */
	for (i = n - 1; i > 0; --i) {
		eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);

		for (r = 0; r * 2 + 1 < i; r = c) {
			c = r * 2 + 1;

			if (c + 1 < i &&
			    eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
				c++;

			if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
				break;

			eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
		}
	}
}

void eytzinger0_sort(void *base, size_t n, size_t size,
		     cmp_func_t cmp_func,
		     swap_func_t swap_func)
{
	struct wrapper w = {
		.cmp  = cmp_func,
		.swap = swap_func,
	};

	return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
}
+37 -26
fs/bcachefs/eytzinger.h
··· 5 5 #include <linux/bitops.h> 6 6 #include <linux/log2.h> 7 7 8 - #include "util.h" 8 + #ifdef EYTZINGER_DEBUG 9 + #define EYTZINGER_BUG_ON(cond) BUG_ON(cond) 10 + #else 11 + #define EYTZINGER_BUG_ON(cond) 12 + #endif 9 13 10 14 /* 11 15 * Traversal for trees in eytzinger layout - a full binary tree layed out in an 12 - * array 13 - */ 14 - 15 - /* 16 - * One based indexing version: 16 + * array. 17 17 * 18 - * With one based indexing each level of the tree starts at a power of two - 19 - * good for cacheline alignment: 18 + * Consider using an eytzinger tree any time you would otherwise be doing binary 19 + * search over an array. Binary search is a worst case scenario for branch 20 + * prediction and prefetching, but in an eytzinger tree every node's children 21 + * are adjacent in memory, thus we can prefetch children before knowing the 22 + * result of the comparison, assuming multiple nodes fit on a cacheline. 23 + * 24 + * Two variants are provided, for one based indexing and zero based indexing. 25 + * 26 + * Zero based indexing is more convenient, but one based indexing has better 27 + * alignment and thus better performance because each new level of the tree 28 + * starts at a power of two, and thus if element 0 was cacheline aligned, each 29 + * new level will be as well. 
20 30 */ 21 31 22 32 static inline unsigned eytzinger1_child(unsigned i, unsigned child) 23 33 { 24 - EBUG_ON(child > 1); 34 + EYTZINGER_BUG_ON(child > 1); 25 35 26 36 return (i << 1) + child; 27 37 } ··· 68 58 69 59 static inline unsigned eytzinger1_next(unsigned i, unsigned size) 70 60 { 71 - EBUG_ON(i > size); 61 + EYTZINGER_BUG_ON(i > size); 72 62 73 63 if (eytzinger1_right_child(i) <= size) { 74 64 i = eytzinger1_right_child(i); ··· 84 74 85 75 static inline unsigned eytzinger1_prev(unsigned i, unsigned size) 86 76 { 87 - EBUG_ON(i > size); 77 + EYTZINGER_BUG_ON(i > size); 88 78 89 79 if (eytzinger1_left_child(i) <= size) { 90 80 i = eytzinger1_left_child(i) + 1; ··· 111 101 unsigned shift = __fls(size) - b; 112 102 int s; 113 103 114 - EBUG_ON(!i || i > size); 104 + EYTZINGER_BUG_ON(!i || i > size); 115 105 116 106 i ^= 1U << b; 117 107 i <<= 1; ··· 136 126 unsigned shift; 137 127 int s; 138 128 139 - EBUG_ON(!i || i > size); 129 + EYTZINGER_BUG_ON(!i || i > size); 140 130 141 131 /* 142 132 * sign bit trick: ··· 174 164 175 165 static inline unsigned eytzinger0_child(unsigned i, unsigned child) 176 166 { 177 - EBUG_ON(child > 1); 167 + EYTZINGER_BUG_ON(child > 1); 178 168 179 169 return (i << 1) + 1 + child; 180 170 } ··· 241 231 (_i) != -1; \ 242 232 (_i) = eytzinger0_next((_i), (_size))) 243 233 244 - typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); 245 - 246 234 /* return greatest node <= @search, or -1 if not found */ 247 235 static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, 248 - eytzinger_cmp_fn cmp, const void *search) 236 + cmp_func_t cmp, const void *search) 249 237 { 250 238 unsigned i, n = 0; 251 239 ··· 252 244 253 245 do { 254 246 i = n; 255 - n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); 247 + n = eytzinger0_child(i, cmp(base + i * size, search) <= 0); 256 248 } while (n < nr); 257 249 258 250 if (n & 1) { 259 251 /* @i was greater than @search, return previous node: */ 
260 - 261 - if (i == eytzinger0_first(nr)) 262 - return -1; 263 - 264 252 return eytzinger0_prev(i, nr); 265 253 } else { 266 254 return i; 267 255 } 256 + } 257 + 258 + static inline ssize_t eytzinger0_find_gt(void *base, size_t nr, size_t size, 259 + cmp_func_t cmp, const void *search) 260 + { 261 + ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); 262 + return eytzinger0_next(idx, size); 268 263 } 269 264 270 265 #define eytzinger0_find(base, nr, size, _cmp, search) \ ··· 280 269 int _res; \ 281 270 \ 282 271 while (_i < _nr && \ 283 - (_res = _cmp(_search, _base + _i * _size, _size))) \ 272 + (_res = _cmp(_search, _base + _i * _size))) \ 284 273 _i = eytzinger0_child(_i, _res > 0); \ 285 274 _i; \ 286 275 }) 287 276 288 - void eytzinger0_sort(void *, size_t, size_t, 289 - int (*cmp_func)(const void *, const void *, size_t), 290 - void (*swap_func)(void *, void *, size_t)); 277 + void eytzinger0_sort_r(void *, size_t, size_t, 278 + cmp_r_func_t, swap_r_func_t, const void *); 279 + void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t); 291 280 292 281 #endif /* _EYTZINGER_H */
+202 -25
fs/bcachefs/fsck.c
··· 63 63 u32 *snapshot, u64 *inum) 64 64 { 65 65 struct bch_subvolume s; 66 - int ret; 67 - 68 - ret = bch2_subvolume_get(trans, subvol, false, 0, &s); 66 + int ret = bch2_subvolume_get(trans, subvol, false, 0, &s); 69 67 70 68 *snapshot = le32_to_cpu(s.snapshot); 71 69 *inum = le64_to_cpu(s.inode); ··· 168 170 169 171 /* Get lost+found, create if it doesn't exist: */ 170 172 static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, 171 - struct bch_inode_unpacked *lostfound) 173 + struct bch_inode_unpacked *lostfound, 174 + u64 reattaching_inum) 172 175 { 173 176 struct bch_fs *c = trans->c; 174 177 struct qstr lostfound_str = QSTR("lost+found"); ··· 184 185 return ret; 185 186 186 187 subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) }; 187 - u32 subvol_snapshot; 188 188 189 - ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol), 190 - &subvol_snapshot, &root_inum.inum); 191 - bch_err_msg(c, ret, "looking up root subvol"); 189 + struct bch_subvolume subvol; 190 + ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol), 191 + false, 0, &subvol); 192 + bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u", 193 + le32_to_cpu(st.master_subvol), snapshot); 192 194 if (ret) 193 195 return ret; 196 + 197 + if (!subvol.inode) { 198 + struct btree_iter iter; 199 + struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter, 200 + BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)), 201 + 0, subvolume); 202 + ret = PTR_ERR_OR_ZERO(subvol); 203 + if (ret) 204 + return ret; 205 + 206 + subvol->v.inode = cpu_to_le64(reattaching_inum); 207 + bch2_trans_iter_exit(trans, &iter); 208 + } 209 + 210 + root_inum.inum = le64_to_cpu(subvol.inode); 194 211 195 212 struct bch_inode_unpacked root_inode; 196 213 struct bch_hash_info root_hash_info; 197 214 u32 root_inode_snapshot = snapshot; 198 215 ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot); 199 - bch_err_msg(c, ret, "looking up root 
inode"); 216 + bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", 217 + root_inum.inum, le32_to_cpu(st.master_subvol)); 200 218 if (ret) 201 219 return ret; 202 220 ··· 309 293 snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); 310 294 } 311 295 312 - ret = lookup_lostfound(trans, dirent_snapshot, &lostfound); 296 + ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum); 313 297 if (ret) 314 298 return ret; 315 299 ··· 378 362 ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot)); 379 363 bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); 380 364 return ret; 365 + } 366 + 367 + static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum) 368 + { 369 + struct bch_fs *c = trans->c; 370 + 371 + if (!bch2_snapshot_is_leaf(c, snapshotid)) { 372 + bch_err(c, "need to reconstruct subvol, but have interior node snapshot"); 373 + return -BCH_ERR_fsck_repair_unimplemented; 374 + } 375 + 376 + /* 377 + * If inum isn't set, that means we're being called from check_dirents, 378 + * not check_inodes - the root of this subvolume doesn't exist or we 379 + * would have found it there: 380 + */ 381 + if (!inum) { 382 + struct btree_iter inode_iter = {}; 383 + struct bch_inode_unpacked new_inode; 384 + u64 cpu = raw_smp_processor_id(); 385 + 386 + bch2_inode_init_early(c, &new_inode); 387 + bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL); 388 + 389 + new_inode.bi_subvol = subvolid; 390 + 391 + int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?: 392 + bch2_btree_iter_traverse(&inode_iter) ?: 393 + bch2_inode_write(trans, &inode_iter, &new_inode); 394 + bch2_trans_iter_exit(trans, &inode_iter); 395 + if (ret) 396 + return ret; 397 + 398 + inum = new_inode.bi_inum; 399 + } 400 + 401 + bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum); 402 + 403 + struct bkey_i_subvolume *new_subvol = 
bch2_trans_kmalloc(trans, sizeof(*new_subvol)); 404 + int ret = PTR_ERR_OR_ZERO(new_subvol); 405 + if (ret) 406 + return ret; 407 + 408 + bkey_subvolume_init(&new_subvol->k_i); 409 + new_subvol->k.p.offset = subvolid; 410 + new_subvol->v.snapshot = cpu_to_le32(snapshotid); 411 + new_subvol->v.inode = cpu_to_le64(inum); 412 + ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0); 413 + if (ret) 414 + return ret; 415 + 416 + struct btree_iter iter; 417 + struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter, 418 + BTREE_ID_snapshots, POS(0, snapshotid), 419 + 0, snapshot); 420 + ret = PTR_ERR_OR_ZERO(s); 421 + bch_err_msg(c, ret, "getting snapshot %u", snapshotid); 422 + if (ret) 423 + return ret; 424 + 425 + u32 snapshot_tree = le32_to_cpu(s->v.tree); 426 + 427 + s->v.subvol = cpu_to_le32(subvolid); 428 + SET_BCH_SNAPSHOT_SUBVOL(&s->v, true); 429 + bch2_trans_iter_exit(trans, &iter); 430 + 431 + struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter, 432 + BTREE_ID_snapshot_trees, POS(0, snapshot_tree), 433 + 0, snapshot_tree); 434 + ret = PTR_ERR_OR_ZERO(st); 435 + bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree); 436 + if (ret) 437 + return ret; 438 + 439 + if (!st->v.master_subvol) 440 + st->v.master_subvol = cpu_to_le32(subvolid); 441 + 442 + bch2_trans_iter_exit(trans, &iter); 443 + return 0; 444 + } 445 + 446 + static int reconstruct_inode(struct btree_trans *trans, u32 snapshot, u64 inum, u64 size, unsigned mode) 447 + { 448 + struct bch_fs *c = trans->c; 449 + struct bch_inode_unpacked new_inode; 450 + 451 + bch2_inode_init_early(c, &new_inode); 452 + bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, mode|0755, 0, NULL); 453 + new_inode.bi_size = size; 454 + new_inode.bi_inum = inum; 455 + 456 + return __bch2_fsck_write_inode(trans, &new_inode, snapshot); 457 + } 458 + 459 + static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 inum) 460 + { 461 + struct 
btree_iter iter = {}; 462 + 463 + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); 464 + struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter); 465 + bch2_trans_iter_exit(trans, &iter); 466 + int ret = bkey_err(k); 467 + if (ret) 468 + return ret; 469 + 470 + return reconstruct_inode(trans, snapshot, inum, k.k->p.offset << 9, S_IFREG); 381 471 } 382 472 383 473 struct snapshots_seen_entry { ··· 1187 1065 if (ret && !bch2_err_matches(ret, ENOENT)) 1188 1066 goto err; 1189 1067 1068 + if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { 1069 + ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum); 1070 + goto do_update; 1071 + } 1072 + 1190 1073 if (fsck_err_on(ret, 1191 1074 c, inode_bi_subvol_missing, 1192 1075 "inode %llu:%u bi_subvol points to missing subvolume %u", ··· 1209 1082 do_update = true; 1210 1083 } 1211 1084 } 1212 - 1085 + do_update: 1213 1086 if (do_update) { 1214 1087 ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot); 1215 1088 bch_err_msg(c, ret, "in fsck updating inode"); ··· 1258 1131 i->count = count2; 1259 1132 1260 1133 if (i->count != count2) { 1261 - bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", 1262 - w->last_pos.inode, i->snapshot, i->count, count2); 1134 + bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", 1135 + w->last_pos.inode, i->snapshot, i->count, count2); 1263 1136 return -BCH_ERR_internal_fsck_err; 1264 1137 } 1265 1138 ··· 1562 1435 goto err; 1563 1436 1564 1437 if (k.k->type != KEY_TYPE_whiteout) { 1438 + if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { 1439 + ret = reconstruct_reg_inode(trans, k.k->p.snapshot, k.k->p.inode) ?: 1440 + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 1441 + if (ret) 1442 + goto err; 1443 + 1444 + inode->last_pos.inode--; 1445 + ret = -BCH_ERR_transaction_restart_nested; 1446 + goto err; 1447 + 
} 1448 + 1565 1449 if (fsck_err_on(!i, c, extent_in_missing_inode, 1566 1450 "extent in missing inode:\n %s", 1567 1451 (printbuf_reset(&buf), ··· 1725 1587 return count2; 1726 1588 1727 1589 if (i->count != count2) { 1728 - bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu", 1729 - i->count, count2); 1590 + bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu", 1591 + w->last_pos.inode, i->snapshot, i->count, count2); 1730 1592 i->count = count2; 1731 1593 if (i->inode.bi_nlink == i->count) 1732 1594 continue; ··· 1923 1785 u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); 1924 1786 u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); 1925 1787 u32 parent_snapshot; 1788 + u32 new_parent_subvol = 0; 1926 1789 u64 parent_inum; 1927 1790 struct printbuf buf = PRINTBUF; 1928 1791 int ret = 0; ··· 1931 1792 ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum); 1932 1793 if (ret && !bch2_err_matches(ret, ENOENT)) 1933 1794 return ret; 1795 + 1796 + if (ret || 1797 + (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) { 1798 + int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); 1799 + if (ret2 && !bch2_err_matches(ret, ENOENT)) 1800 + return ret2; 1801 + } 1802 + 1803 + if (ret && 1804 + !new_parent_subvol && 1805 + (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { 1806 + /* 1807 + * Couldn't find a subvol for dirent's snapshot - but we lost 1808 + * subvols, so we need to reconstruct: 1809 + */ 1810 + ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0); 1811 + if (ret) 1812 + return ret; 1813 + 1814 + parent_snapshot = d.k->p.snapshot; 1815 + } 1934 1816 1935 1817 if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol, 1936 1818 "dirent parent_subvol points to missing subvolume\n%s", ··· 1961 1801 "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s", 1962 1802 parent_snapshot, 
1963 1803 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { 1964 - u32 new_parent_subvol; 1965 - ret = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); 1966 - if (ret) 1967 - goto err; 1804 + if (!new_parent_subvol) { 1805 + bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot); 1806 + return -BCH_ERR_fsck_repair_unimplemented; 1807 + } 1968 1808 1969 1809 struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); 1970 1810 ret = PTR_ERR_OR_ZERO(new_dirent); ··· 2010 1850 2011 1851 ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); 2012 1852 if (ret && !bch2_err_matches(ret, ENOENT)) 2013 - return ret; 1853 + goto err; 2014 1854 2015 - if (fsck_err_on(parent_subvol != subvol_root.bi_parent_subvol, 1855 + if (ret) { 1856 + bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); 1857 + ret = -BCH_ERR_fsck_repair_unimplemented; 1858 + ret = 0; 1859 + goto err; 1860 + } 1861 + 1862 + if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol, 2016 1863 c, inode_bi_parent_wrong, 2017 1864 "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u", 2018 1865 target_inum, ··· 2027 1860 subvol_root.bi_parent_subvol = parent_subvol; 2028 1861 ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot); 2029 1862 if (ret) 2030 - return ret; 1863 + goto err; 2031 1864 } 2032 1865 2033 1866 ret = check_dirent_target(trans, iter, d, &subvol_root, 2034 1867 target_snapshot); 2035 1868 if (ret) 2036 - return ret; 1869 + goto err; 2037 1870 out: 2038 1871 err: 2039 1872 fsck_err: ··· 2050 1883 struct snapshots_seen *s) 2051 1884 { 2052 1885 struct bch_fs *c = trans->c; 2053 - struct bkey_s_c_dirent d; 2054 1886 struct inode_walker_entry *i; 2055 1887 struct printbuf buf = PRINTBUF; 2056 1888 struct bpos equiv; ··· 2088 1922 *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); 2089 1923 dir->first_this_inode = false; 2090 
1924 1925 + if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { 1926 + ret = reconstruct_inode(trans, k.k->p.snapshot, k.k->p.inode, 0, S_IFDIR) ?: 1927 + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 1928 + if (ret) 1929 + goto err; 1930 + 1931 + dir->last_pos.inode--; 1932 + ret = -BCH_ERR_transaction_restart_nested; 1933 + goto err; 1934 + } 1935 + 2091 1936 if (fsck_err_on(!i, c, dirent_in_missing_dir_inode, 2092 1937 "dirent in nonexisting directory:\n%s", 2093 1938 (printbuf_reset(&buf), ··· 2133 1956 if (k.k->type != KEY_TYPE_dirent) 2134 1957 goto out; 2135 1958 2136 - d = bkey_s_c_to_dirent(k); 1959 + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); 2137 1960 2138 1961 if (d.v->d_type == DT_SUBVOL) { 2139 1962 ret = check_dirent_to_subvol(trans, iter, d);
+1 -2
fs/bcachefs/journal_seq_blacklist.c
··· 95 95 return ret ?: bch2_blacklist_table_initialize(c); 96 96 } 97 97 98 - static int journal_seq_blacklist_table_cmp(const void *_l, 99 - const void *_r, size_t size) 98 + static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r) 100 99 { 101 100 const struct journal_seq_blacklist_table_entry *l = _l; 102 101 const struct journal_seq_blacklist_table_entry *r = _r;
+1 -27
fs/bcachefs/mean_and_variance_test.c
··· 136 136 d, mean, stddev, weighted_mean, weighted_stddev); 137 137 } 138 138 139 - static void mean_and_variance_test_2(struct kunit *test) 140 - { 141 - s64 d[] = { 100, 10, 10, 10, 10, 10, 10 }; 142 - s64 mean[] = { 10, 10, 10, 10, 10, 10, 10 }; 143 - s64 stddev[] = { 9, 9, 9, 9, 9, 9, 9 }; 144 - s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 }; 145 - s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 }; 146 - 147 - do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, 148 - d, mean, stddev, weighted_mean, weighted_stddev); 149 - } 150 - 151 139 /* Test behaviour where we switch from one steady state to another: */ 152 - static void mean_and_variance_test_3(struct kunit *test) 140 + static void mean_and_variance_test_2(struct kunit *test) 153 141 { 154 142 s64 d[] = { 100, 100, 100, 100, 100 }; 155 143 s64 mean[] = { 22, 32, 40, 46, 50 }; 156 144 s64 stddev[] = { 32, 39, 42, 44, 45 }; 157 - s64 weighted_mean[] = { 32, 49, 61, 71, 78 }; 158 - s64 weighted_stddev[] = { 38, 44, 44, 41, 38 }; 159 - 160 - do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, 161 - d, mean, stddev, weighted_mean, weighted_stddev); 162 - } 163 - 164 - static void mean_and_variance_test_4(struct kunit *test) 165 - { 166 - s64 d[] = { 100, 100, 100, 100, 100 }; 167 - s64 mean[] = { 10, 11, 12, 13, 14 }; 168 - s64 stddev[] = { 9, 13, 15, 17, 19 }; 169 145 s64 weighted_mean[] = { 32, 49, 61, 71, 78 }; 170 146 s64 weighted_stddev[] = { 38, 44, 44, 41, 38 }; 171 147 ··· 206 230 KUNIT_CASE(mean_and_variance_weighted_advanced_test), 207 231 KUNIT_CASE(mean_and_variance_test_1), 208 232 KUNIT_CASE(mean_and_variance_test_2), 209 - KUNIT_CASE(mean_and_variance_test_3), 210 - KUNIT_CASE(mean_and_variance_test_4), 211 233 {} 212 234 }; 213 235
+2 -2
fs/bcachefs/opts.h
··· 368 368 OPT_STR_NOLIMIT(bch2_recovery_passes), \ 369 369 BCH2_NO_SB_OPT, 0, \ 370 370 NULL, "Exit recovery after specified pass") \ 371 - x(keep_journal, u8, \ 371 + x(retain_recovery_info, u8, \ 372 372 0, \ 373 373 OPT_BOOL(), \ 374 374 BCH2_NO_SB_OPT, false, \ 375 - NULL, "Don't free journal entries/keys after startup")\ 375 + NULL, "Don't free journal entries/keys, scanned btree nodes after startup")\ 376 376 x(read_entire_journal, u8, \ 377 377 0, \ 378 378 OPT_BOOL(), \
+68 -40
fs/bcachefs/recovery.c
··· 4 4 #include "alloc_background.h" 5 5 #include "bkey_buf.h" 6 6 #include "btree_journal_iter.h" 7 + #include "btree_node_scan.h" 7 8 #include "btree_update.h" 8 9 #include "btree_update_interior.h" 9 10 #include "btree_io.h" ··· 33 32 34 33 #define QSTR(n) { { { .len = strlen(n) } }, .name = n } 35 34 35 + void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) 36 + { 37 + u64 b = BIT_ULL(btree); 38 + 39 + if (!(c->sb.btrees_lost_data & b)) { 40 + bch_err(c, "flagging btree %s lost data", bch2_btree_id_str(btree)); 41 + 42 + mutex_lock(&c->sb_lock); 43 + bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b); 44 + bch2_write_super(c); 45 + mutex_unlock(&c->sb_lock); 46 + } 47 + } 48 + 36 49 static bool btree_id_is_alloc(enum btree_id id) 37 50 { 38 51 switch (id) { ··· 62 47 } 63 48 64 49 /* for -o reconstruct_alloc: */ 65 - static void do_reconstruct_alloc(struct bch_fs *c) 50 + static void bch2_reconstruct_alloc(struct bch_fs *c) 66 51 { 67 52 bch2_journal_log_msg(c, "dropping alloc info"); 68 53 bch_info(c, "dropping and reconstructing all alloc info"); ··· 97 82 98 83 c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); 99 84 100 - struct journal_keys *keys = &c->journal_keys; 101 - size_t src, dst; 102 85 103 - move_gap(keys, keys->nr); 104 - 105 - for (src = 0, dst = 0; src < keys->nr; src++) 106 - if (!btree_id_is_alloc(keys->data[src].btree_id)) 107 - keys->data[dst++] = keys->data[src]; 108 - keys->nr = keys->gap = dst; 86 + bch2_shoot_down_journal_keys(c, BTREE_ID_alloc, 87 + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); 88 + bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers, 89 + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); 90 + bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard, 91 + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); 92 + bch2_shoot_down_journal_keys(c, BTREE_ID_freespace, 93 + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); 94 + bch2_shoot_down_journal_keys(c, 
BTREE_ID_bucket_gens, 95 + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); 109 96 } 110 97 111 98 /* ··· 286 269 bch2_trans_put(trans); 287 270 trans = NULL; 288 271 289 - if (!c->opts.keep_journal && 272 + if (!c->opts.retain_recovery_info && 290 273 c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) 291 274 bch2_journal_keys_put_initial(c); 292 275 ··· 450 433 451 434 static int read_btree_roots(struct bch_fs *c) 452 435 { 453 - unsigned i; 454 436 int ret = 0; 455 437 456 - for (i = 0; i < btree_id_nr_alive(c); i++) { 438 + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { 457 439 struct btree_root *r = bch2_btree_id_root(c, i); 458 440 459 441 if (!r->alive) ··· 461 445 if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc) 462 446 continue; 463 447 464 - if (r->error) { 465 - __fsck_err(c, 466 - btree_id_is_alloc(i) 467 - ? FSCK_CAN_IGNORE : 0, 468 - btree_root_bkey_invalid, 469 - "invalid btree root %s", 470 - bch2_btree_id_str(i)); 471 - if (i == BTREE_ID_alloc) 448 + if (mustfix_fsck_err_on((ret = r->error), 449 + c, btree_root_bkey_invalid, 450 + "invalid btree root %s", 451 + bch2_btree_id_str(i)) || 452 + mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)), 453 + c, btree_root_read_error, 454 + "error reading btree root %s l=%u: %s", 455 + bch2_btree_id_str(i), r->level, bch2_err_str(ret))) { 456 + if (btree_id_is_alloc(i)) { 457 + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations); 458 + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info); 459 + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus); 460 + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); 461 + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs); 472 462 c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); 473 - } 463 + r->error = 0; 464 + } else if (!(c->recovery_passes_explicit & 
BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) { 465 + bch_info(c, "will run btree node scan"); 466 + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes); 467 + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); 468 + } 474 469 475 - ret = bch2_btree_root_read(c, i, &r->key, r->level); 476 - if (ret) { 477 - fsck_err(c, 478 - btree_root_read_error, 479 - "error reading btree root %s", 480 - bch2_btree_id_str(i)); 481 - if (btree_id_is_alloc(i)) 482 - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); 483 470 ret = 0; 471 + bch2_btree_lost_data(c, i); 484 472 } 485 473 } 486 474 487 - for (i = 0; i < BTREE_ID_NR; i++) { 475 + for (unsigned i = 0; i < BTREE_ID_NR; i++) { 488 476 struct btree_root *r = bch2_btree_id_root(c, i); 489 477 490 - if (!r->b) { 478 + if (!r->b && !r->error) { 491 479 r->alive = false; 492 480 r->level = 0; 493 - bch2_btree_root_alloc(c, i); 481 + bch2_btree_root_alloc_fake(c, i, 0); 494 482 } 495 483 } 496 484 fsck_err: ··· 671 651 goto err; 672 652 } 673 653 674 - if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { 654 + if (!c->sb.clean || c->opts.fsck || c->opts.retain_recovery_info) { 675 655 struct genradix_iter iter; 676 656 struct journal_replay **i; 677 657 ··· 751 731 c->journal_replay_seq_end = blacklist_seq - 1; 752 732 753 733 if (c->opts.reconstruct_alloc) 754 - do_reconstruct_alloc(c); 734 + bch2_reconstruct_alloc(c); 755 735 756 736 zero_out_btree_mem_ptr(&c->journal_keys); 757 737 ··· 858 838 } 859 839 860 840 if (!test_bit(BCH_FS_error, &c->flags) && 861 - (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) || 862 - !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) { 863 - memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required)); 841 + !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) { 864 842 memset(ext->errors_silent, 0, sizeof(ext->errors_silent)); 843 + write_sb = true; 844 + } 
845 + 846 + if (c->opts.fsck && 847 + !test_bit(BCH_FS_error, &c->flags) && 848 + c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 && 849 + ext->btrees_lost_data) { 850 + ext->btrees_lost_data = 0; 865 851 write_sb = true; 866 852 } 867 853 ··· 909 883 out: 910 884 bch2_flush_fsck_errs(c); 911 885 912 - if (!c->opts.keep_journal && 913 - test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) 886 + if (!c->opts.retain_recovery_info) { 914 887 bch2_journal_keys_put_initial(c); 888 + bch2_find_btree_nodes_exit(&c->found_btree_nodes); 889 + } 915 890 kfree(clean); 916 891 917 892 if (!ret && ··· 938 911 int ret; 939 912 940 913 bch_notice(c, "initializing new filesystem"); 914 + set_bit(BCH_FS_new_fs, &c->flags); 941 915 942 916 mutex_lock(&c->sb_lock); 943 917 c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); ··· 957 929 set_bit(BCH_FS_may_go_rw, &c->flags); 958 930 959 931 for (unsigned i = 0; i < BTREE_ID_NR; i++) 960 - bch2_btree_root_alloc(c, i); 932 + bch2_btree_root_alloc_fake(c, i, 0); 961 933 962 934 for_each_member_device(c, ca) 963 935 bch2_dev_usage_init(ca);
+2
fs/bcachefs/recovery.h
··· 2 2 #ifndef _BCACHEFS_RECOVERY_H 3 3 #define _BCACHEFS_RECOVERY_H 4 4 5 + void bch2_btree_lost_data(struct bch_fs *, enum btree_id); 6 + 5 7 int bch2_journal_replay(struct bch_fs *); 6 8 7 9 int bch2_fs_recovery(struct bch_fs *);
+33 -9
fs/bcachefs/recovery_passes.c
··· 4 4 #include "alloc_background.h" 5 5 #include "backpointers.h" 6 6 #include "btree_gc.h" 7 + #include "btree_node_scan.h" 7 8 #include "ec.h" 8 9 #include "fsck.h" 9 10 #include "inode.h" ··· 60 59 #undef x 61 60 }; 62 61 63 - u64 bch2_recovery_passes_to_stable(u64 v) 64 - { 65 - static const u8 map[] = { 62 + static const u8 passes_to_stable_map[] = { 66 63 #define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, 67 64 BCH_RECOVERY_PASSES() 68 65 #undef x 69 - }; 66 + }; 70 67 68 + static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass) 69 + { 70 + return passes_to_stable_map[pass]; 71 + } 72 + 73 + u64 bch2_recovery_passes_to_stable(u64 v) 74 + { 71 75 u64 ret = 0; 72 - for (unsigned i = 0; i < ARRAY_SIZE(map); i++) 76 + for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++) 73 77 if (v & BIT_ULL(i)) 74 - ret |= BIT_ULL(map[i]); 78 + ret |= BIT_ULL(passes_to_stable_map[i]); 75 79 return ret; 76 80 } 77 81 ··· 122 116 int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, 123 117 enum bch_recovery_pass pass) 124 118 { 125 - __le64 s = cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(pass))); 119 + enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass); 126 120 127 121 mutex_lock(&c->sb_lock); 128 122 struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); 129 123 130 - if (!(ext->recovery_passes_required[0] & s)) { 131 - ext->recovery_passes_required[0] |= s; 124 + if (!test_bit_le64(s, ext->recovery_passes_required)) { 125 + __set_bit_le64(s, ext->recovery_passes_required); 132 126 bch2_write_super(c); 133 127 } 134 128 mutex_unlock(&c->sb_lock); 135 129 136 130 return bch2_run_explicit_recovery_pass(c, pass); 131 + } 132 + 133 + static void bch2_clear_recovery_pass_required(struct bch_fs *c, 134 + enum bch_recovery_pass pass) 135 + { 136 + enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass); 137 + 138 + mutex_lock(&c->sb_lock); 139 + 
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); 140 + 141 + if (test_bit_le64(s, ext->recovery_passes_required)) { 142 + __clear_bit_le64(s, ext->recovery_passes_required); 143 + bch2_write_super(c); 144 + } 145 + mutex_unlock(&c->sb_lock); 137 146 } 138 147 139 148 u64 bch2_fsck_recovery_passes(void) ··· 238 217 } 239 218 240 219 c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); 220 + 221 + if (!test_bit(BCH_FS_error, &c->flags)) 222 + bch2_clear_recovery_pass_required(c, c->curr_recovery_pass); 241 223 242 224 c->curr_recovery_pass++; 243 225 }
+2
fs/bcachefs/recovery_passes_types.h
··· 13 13 * must never change: 14 14 */ 15 15 #define BCH_RECOVERY_PASSES() \ 16 + x(scan_for_btree_nodes, 37, 0) \ 16 17 x(check_topology, 4, 0) \ 17 18 x(alloc_read, 0, PASS_ALWAYS) \ 18 19 x(stripes_read, 1, PASS_ALWAYS) \ ··· 32 31 x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ 33 32 x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ 34 33 x(bucket_gens_init, 17, 0) \ 34 + x(reconstruct_snapshots, 38, 0) \ 35 35 x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ 36 36 x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ 37 37 x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
+12 -7
fs/bcachefs/replicas.c
··· 6 6 #include "replicas.h" 7 7 #include "super-io.h" 8 8 9 + #include <linux/sort.h> 10 + 9 11 static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, 10 12 struct bch_replicas_cpu *); 11 13 12 14 /* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */ 13 - static int bch2_memcmp(const void *l, const void *r, size_t size) 15 + static int bch2_memcmp(const void *l, const void *r, const void *priv) 14 16 { 17 + size_t size = (size_t) priv; 15 18 return memcmp(l, r, size); 16 19 } 17 20 ··· 42 39 43 40 static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) 44 41 { 45 - eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL); 42 + eytzinger0_sort_r(r->entries, r->nr, r->entry_size, 43 + bch2_memcmp, NULL, (void *)(size_t)r->entry_size); 46 44 } 47 45 48 46 static void bch2_replicas_entry_v0_to_text(struct printbuf *out, ··· 232 228 233 229 verify_replicas_entry(search); 234 230 235 - #define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size) 231 + #define entry_cmp(_l, _r) memcmp(_l, _r, entry_size) 236 232 idx = eytzinger0_find(r->entries, r->nr, r->entry_size, 237 233 entry_cmp, search); 238 234 #undef entry_cmp ··· 828 824 { 829 825 unsigned i; 830 826 831 - sort_cmp_size(cpu_r->entries, 832 - cpu_r->nr, 833 - cpu_r->entry_size, 834 - bch2_memcmp, NULL); 827 + sort_r(cpu_r->entries, 828 + cpu_r->nr, 829 + cpu_r->entry_size, 830 + bch2_memcmp, NULL, 831 + (void *)(size_t)cpu_r->entry_size); 835 832 836 833 for (i = 0; i < cpu_r->nr; i++) { 837 834 struct bch_replicas_entry_v1 *e =
+4 -1
fs/bcachefs/sb-errors_types.h
··· 267 267 x(subvol_unreachable, 259) \ 268 268 x(btree_node_bkey_bad_u64s, 260) \ 269 269 x(btree_node_topology_empty_interior_node, 261) \ 270 - x(btree_ptr_v2_min_key_bad, 262) 270 + x(btree_ptr_v2_min_key_bad, 262) \ 271 + x(btree_root_unreadable_and_scan_found_nothing, 263) \ 272 + x(snapshot_node_missing, 264) \ 273 + x(dup_backpointer_to_bad_csum_extent, 265) 271 274 272 275 enum bch_sb_error_id { 273 276 #define x(t, n) BCH_FSCK_ERR_##t = n,
+171 -2
fs/bcachefs/snapshot.c
··· 8 8 #include "errcode.h" 9 9 #include "error.h" 10 10 #include "fs.h" 11 + #include "recovery_passes.h" 11 12 #include "snapshot.h" 12 13 13 14 #include <linux/random.h> ··· 575 574 u32 subvol_id; 576 575 577 576 ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); 577 + bch_err_fn(c, ret); 578 + 579 + if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */ 580 + ret = 0; 581 + goto err; 582 + } 583 + 578 584 if (ret) 579 585 goto err; 580 586 ··· 739 731 u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); 740 732 u32 real_depth; 741 733 struct printbuf buf = PRINTBUF; 742 - bool should_have_subvol; 743 734 u32 i, id; 744 735 int ret = 0; 745 736 ··· 784 777 } 785 778 } 786 779 787 - should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && 780 + bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && 788 781 !BCH_SNAPSHOT_DELETED(&s); 789 782 790 783 if (should_have_subvol) { ··· 882 875 BTREE_ITER_PREFETCH, k, 883 876 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 884 877 check_snapshot(trans, &iter, k))); 878 + bch_err_fn(c, ret); 879 + return ret; 880 + } 881 + 882 + static int check_snapshot_exists(struct btree_trans *trans, u32 id) 883 + { 884 + struct bch_fs *c = trans->c; 885 + 886 + if (bch2_snapshot_equiv(c, id)) 887 + return 0; 888 + 889 + u32 tree_id; 890 + int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id); 891 + if (ret) 892 + return ret; 893 + 894 + struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot)); 895 + ret = PTR_ERR_OR_ZERO(snapshot); 896 + if (ret) 897 + return ret; 898 + 899 + bkey_snapshot_init(&snapshot->k_i); 900 + snapshot->k.p = POS(0, id); 901 + snapshot->v.tree = cpu_to_le32(tree_id); 902 + snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c)); 903 + 904 + return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: 905 + bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, 906 + bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?: 907 + 
bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i)); 908 + } 909 + 910 + /* Figure out which snapshot nodes belong in the same tree: */ 911 + struct snapshot_tree_reconstruct { 912 + enum btree_id btree; 913 + struct bpos cur_pos; 914 + snapshot_id_list cur_ids; 915 + DARRAY(snapshot_id_list) trees; 916 + }; 917 + 918 + static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r) 919 + { 920 + darray_for_each(r->trees, i) 921 + darray_exit(i); 922 + darray_exit(&r->trees); 923 + darray_exit(&r->cur_ids); 924 + } 925 + 926 + static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos) 927 + { 928 + return r->btree == BTREE_ID_inodes 929 + ? r->cur_pos.offset == pos.offset 930 + : r->cur_pos.inode == pos.inode; 931 + } 932 + 933 + static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r) 934 + { 935 + darray_for_each(*l, i) 936 + if (snapshot_list_has_id(r, *i)) 937 + return true; 938 + return false; 939 + } 940 + 941 + static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s) 942 + { 943 + bool first = true; 944 + darray_for_each(*s, i) { 945 + if (!first) 946 + prt_char(out, ' '); 947 + first = false; 948 + prt_printf(out, "%u", *i); 949 + } 950 + } 951 + 952 + static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r) 953 + { 954 + if (r->cur_ids.nr) { 955 + darray_for_each(r->trees, i) 956 + if (snapshot_id_lists_have_common(i, &r->cur_ids)) { 957 + int ret = snapshot_list_merge(c, i, &r->cur_ids); 958 + if (ret) 959 + return ret; 960 + goto out; 961 + } 962 + darray_push(&r->trees, r->cur_ids); 963 + darray_init(&r->cur_ids); 964 + } 965 + out: 966 + r->cur_ids.nr = 0; 967 + return 0; 968 + } 969 + 970 + static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos) 971 + { 972 + if (!same_snapshot(r, pos)) 973 + snapshot_tree_reconstruct_next(c, r); 974 + r->cur_pos = pos; 975 
+ return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot); 976 + } 977 + 978 + int bch2_reconstruct_snapshots(struct bch_fs *c) 979 + { 980 + struct btree_trans *trans = bch2_trans_get(c); 981 + struct printbuf buf = PRINTBUF; 982 + struct snapshot_tree_reconstruct r = {}; 983 + int ret = 0; 984 + 985 + for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { 986 + if (btree_type_has_snapshots(btree)) { 987 + r.btree = btree; 988 + 989 + ret = for_each_btree_key(trans, iter, btree, POS_MIN, 990 + BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_PREFETCH, k, ({ 991 + get_snapshot_trees(c, &r, k.k->p); 992 + })); 993 + if (ret) 994 + goto err; 995 + 996 + snapshot_tree_reconstruct_next(c, &r); 997 + } 998 + } 999 + 1000 + darray_for_each(r.trees, t) { 1001 + printbuf_reset(&buf); 1002 + snapshot_id_list_to_text(&buf, t); 1003 + 1004 + darray_for_each(*t, id) { 1005 + if (fsck_err_on(!bch2_snapshot_equiv(c, *id), 1006 + c, snapshot_node_missing, 1007 + "snapshot node %u from tree %s missing", *id, buf.buf)) { 1008 + if (t->nr > 1) { 1009 + bch_err(c, "cannot reconstruct snapshot trees with multiple nodes"); 1010 + ret = -BCH_ERR_fsck_repair_unimplemented; 1011 + goto err; 1012 + } 1013 + 1014 + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 1015 + check_snapshot_exists(trans, *id)); 1016 + if (ret) 1017 + goto err; 1018 + } 1019 + } 1020 + } 1021 + fsck_err: 1022 + err: 1023 + bch2_trans_put(trans); 1024 + snapshot_tree_reconstruct_exit(&r); 1025 + printbuf_exit(&buf); 885 1026 bch_err_fn(c, ret); 886 1027 return ret; 887 1028 } ··· 1844 1689 POS_MIN, 0, k, 1845 1690 (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); 1846 1691 bch_err_fn(c, ret); 1692 + 1693 + /* 1694 + * It's important that we check if we need to reconstruct snapshots 1695 + * before going RW, so we mark that pass as required in the superblock - 1696 + * otherwise, we could end up deleting keys with missing snapshot nodes 1697 + * instead 1698 + */ 1699 + BUG_ON(!test_bit(BCH_FS_new_fs, 
&c->flags) && 1700 + test_bit(BCH_FS_may_go_rw, &c->flags)); 1701 + 1702 + if (bch2_err_matches(ret, EIO) || 1703 + (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots))) 1704 + ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots); 1705 + 1847 1706 return ret; 1848 1707 } 1849 1708
+23 -3
fs/bcachefs/snapshot.h
··· 209 209 210 210 static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) 211 211 { 212 - int ret; 213 - 214 212 BUG_ON(snapshot_list_has_id(s, id)); 215 - ret = darray_push(s, id); 213 + int ret = darray_push(s, id); 216 214 if (ret) 217 215 bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); 218 216 return ret; 217 + } 218 + 219 + static inline int snapshot_list_add_nodup(struct bch_fs *c, snapshot_id_list *s, u32 id) 220 + { 221 + int ret = snapshot_list_has_id(s, id) 222 + ? 0 223 + : darray_push(s, id); 224 + if (ret) 225 + bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); 226 + return ret; 227 + } 228 + 229 + static inline int snapshot_list_merge(struct bch_fs *c, snapshot_id_list *dst, snapshot_id_list *src) 230 + { 231 + darray_for_each(*src, i) { 232 + int ret = snapshot_list_add_nodup(c, dst, *i); 233 + if (ret) 234 + return ret; 235 + } 236 + 237 + return 0; 219 238 } 220 239 221 240 int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, ··· 248 229 249 230 int bch2_check_snapshot_trees(struct bch_fs *); 250 231 int bch2_check_snapshots(struct bch_fs *); 232 + int bch2_reconstruct_snapshots(struct bch_fs *); 251 233 252 234 int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); 253 235 void bch2_delete_dead_snapshots_work(struct work_struct *);
+8 -1
fs/bcachefs/super-io.c
··· 527 527 memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent)); 528 528 529 529 struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext); 530 - if (ext) 530 + if (ext) { 531 531 le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent, 532 532 sizeof(c->sb.errors_silent) * 8); 533 + c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data); 534 + } 533 535 534 536 for_each_member_device(c, ca) { 535 537 struct bch_member m = bch2_sb_member_get(src, ca->dev_idx); ··· 1164 1162 1165 1163 kfree(errors_silent); 1166 1164 } 1165 + 1166 + prt_printf(out, "Btrees with missing data:"); 1167 + prt_tab(out); 1168 + prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data)); 1169 + prt_newline(out); 1167 1170 } 1168 1171 1169 1172 static const struct bch_sb_field_ops bch_sb_field_ops_ext = {
+3
fs/bcachefs/super.c
··· 15 15 #include "btree_gc.h" 16 16 #include "btree_journal_iter.h" 17 17 #include "btree_key_cache.h" 18 + #include "btree_node_scan.h" 18 19 #include "btree_update_interior.h" 19 20 #include "btree_io.h" 20 21 #include "btree_write_buffer.h" ··· 537 536 for (i = 0; i < BCH_TIME_STAT_NR; i++) 538 537 bch2_time_stats_exit(&c->times[i]); 539 538 539 + bch2_find_btree_nodes_exit(&c->found_btree_nodes); 540 540 bch2_free_pending_node_rewrites(c); 541 541 bch2_fs_sb_errors_exit(c); 542 542 bch2_fs_counters_exit(c); ··· 562 560 bch2_io_clock_exit(&c->io_clock[READ]); 563 561 bch2_fs_compress_exit(c); 564 562 bch2_journal_keys_put_initial(c); 563 + bch2_find_btree_nodes_exit(&c->found_btree_nodes); 565 564 BUG_ON(atomic_read(&c->journal_keys.ref)); 566 565 bch2_fs_btree_write_buffer_exit(c); 567 566 percpu_free_rwsem(&c->mark_lock);
-143
fs/bcachefs/util.c
··· 707 707 } 708 708 } 709 709 710 - static int alignment_ok(const void *base, size_t align) 711 - { 712 - return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || 713 - ((unsigned long)base & (align - 1)) == 0; 714 - } 715 - 716 - static void u32_swap(void *a, void *b, size_t size) 717 - { 718 - u32 t = *(u32 *)a; 719 - *(u32 *)a = *(u32 *)b; 720 - *(u32 *)b = t; 721 - } 722 - 723 - static void u64_swap(void *a, void *b, size_t size) 724 - { 725 - u64 t = *(u64 *)a; 726 - *(u64 *)a = *(u64 *)b; 727 - *(u64 *)b = t; 728 - } 729 - 730 - static void generic_swap(void *a, void *b, size_t size) 731 - { 732 - char t; 733 - 734 - do { 735 - t = *(char *)a; 736 - *(char *)a++ = *(char *)b; 737 - *(char *)b++ = t; 738 - } while (--size > 0); 739 - } 740 - 741 - static inline int do_cmp(void *base, size_t n, size_t size, 742 - int (*cmp_func)(const void *, const void *, size_t), 743 - size_t l, size_t r) 744 - { 745 - return cmp_func(base + inorder_to_eytzinger0(l, n) * size, 746 - base + inorder_to_eytzinger0(r, n) * size, 747 - size); 748 - } 749 - 750 - static inline void do_swap(void *base, size_t n, size_t size, 751 - void (*swap_func)(void *, void *, size_t), 752 - size_t l, size_t r) 753 - { 754 - swap_func(base + inorder_to_eytzinger0(l, n) * size, 755 - base + inorder_to_eytzinger0(r, n) * size, 756 - size); 757 - } 758 - 759 - void eytzinger0_sort(void *base, size_t n, size_t size, 760 - int (*cmp_func)(const void *, const void *, size_t), 761 - void (*swap_func)(void *, void *, size_t)) 762 - { 763 - int i, c, r; 764 - 765 - if (!swap_func) { 766 - if (size == 4 && alignment_ok(base, 4)) 767 - swap_func = u32_swap; 768 - else if (size == 8 && alignment_ok(base, 8)) 769 - swap_func = u64_swap; 770 - else 771 - swap_func = generic_swap; 772 - } 773 - 774 - /* heapify */ 775 - for (i = n / 2 - 1; i >= 0; --i) { 776 - for (r = i; r * 2 + 1 < n; r = c) { 777 - c = r * 2 + 1; 778 - 779 - if (c + 1 < n && 780 - do_cmp(base, n, size, cmp_func, c, c + 1) < 0) 781 - 
c++; 782 - 783 - if (do_cmp(base, n, size, cmp_func, r, c) >= 0) 784 - break; 785 - 786 - do_swap(base, n, size, swap_func, r, c); 787 - } 788 - } 789 - 790 - /* sort */ 791 - for (i = n - 1; i > 0; --i) { 792 - do_swap(base, n, size, swap_func, 0, i); 793 - 794 - for (r = 0; r * 2 + 1 < i; r = c) { 795 - c = r * 2 + 1; 796 - 797 - if (c + 1 < i && 798 - do_cmp(base, n, size, cmp_func, c, c + 1) < 0) 799 - c++; 800 - 801 - if (do_cmp(base, n, size, cmp_func, r, c) >= 0) 802 - break; 803 - 804 - do_swap(base, n, size, swap_func, r, c); 805 - } 806 - } 807 - } 808 - 809 - void sort_cmp_size(void *base, size_t num, size_t size, 810 - int (*cmp_func)(const void *, const void *, size_t), 811 - void (*swap_func)(void *, void *, size_t size)) 812 - { 813 - /* pre-scale counters for performance */ 814 - int i = (num/2 - 1) * size, n = num * size, c, r; 815 - 816 - if (!swap_func) { 817 - if (size == 4 && alignment_ok(base, 4)) 818 - swap_func = u32_swap; 819 - else if (size == 8 && alignment_ok(base, 8)) 820 - swap_func = u64_swap; 821 - else 822 - swap_func = generic_swap; 823 - } 824 - 825 - /* heapify */ 826 - for ( ; i >= 0; i -= size) { 827 - for (r = i; r * 2 + size < n; r = c) { 828 - c = r * 2 + size; 829 - if (c < n - size && 830 - cmp_func(base + c, base + c + size, size) < 0) 831 - c += size; 832 - if (cmp_func(base + r, base + c, size) >= 0) 833 - break; 834 - swap_func(base + r, base + c, size); 835 - } 836 - } 837 - 838 - /* sort */ 839 - for (i = n - size; i > 0; i -= size) { 840 - swap_func(base, base + i, size); 841 - for (r = 0; r * 2 + size < i; r = c) { 842 - c = r * 2 + size; 843 - if (c < i - size && 844 - cmp_func(base + c, base + c + size, size) < 0) 845 - c += size; 846 - if (cmp_func(base + r, base + c, size) >= 0) 847 - break; 848 - swap_func(base + r, base + c, size); 849 - } 850 - } 851 - } 852 - 853 710 #if 0 854 711 void eytzinger1_test(void) 855 712 {
+10 -4
fs/bcachefs/util.h
··· 631 631 memset(s + bytes, c, rem); 632 632 } 633 633 634 - void sort_cmp_size(void *base, size_t num, size_t size, 635 - int (*cmp_func)(const void *, const void *, size_t), 636 - void (*swap_func)(void *, void *, size_t)); 637 - 638 634 /* just the memmove, doesn't update @_nr */ 639 635 #define __array_insert_item(_array, _nr, _pos) \ 640 636 memmove(&(_array)[(_pos) + 1], \ ··· 791 795 static inline void __set_bit_le64(size_t bit, __le64 *addr) 792 796 { 793 797 addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64)); 798 + } 799 + 800 + static inline void __clear_bit_le64(size_t bit, __le64 *addr) 801 + { 802 + addr[bit / 64] &= !cpu_to_le64(BIT_ULL(bit % 64)); 803 + } 804 + 805 + static inline bool test_bit_le64(size_t bit, __le64 *addr) 806 + { 807 + return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0; 794 808 } 795 809 796 810 #endif /* _BCACHEFS_UTIL_H */