Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'bcachefs-2024-10-14' of git://evilpiepirate.org/bcachefs

Pull bcachefs fixes from Kent Overstreet:

- New metadata version inode_has_child_snapshots

This fixes bugs with handling of unlinked inodes + snapshots, in
particular when an inode is reattached after taking a snapshot;
deleted inodes now get correctly cleaned up across snapshots.

- Disk accounting rewrite fixes
   - validation fixes for when a device has been removed
   - fix journal replay failing with "journal_reclaim_would_deadlock"

- Some more small fixes for erasure coding + device removal

- Assorted small syzbot fixes

* tag 'bcachefs-2024-10-14' of git://evilpiepirate.org/bcachefs: (27 commits)
bcachefs: Fix sysfs warning in fstests generic/730,731
bcachefs: Handle race between stripe reuse, invalidate_stripe_to_dev
bcachefs: Fix kasan splat in new_stripe_alloc_buckets()
bcachefs: Add missing validation for bch_stripe.csum_granularity_bits
bcachefs: Fix missing bounds checks in bch2_alloc_read()
bcachefs: fix uaf in bch2_dio_write_done()
bcachefs: Improve check_snapshot_exists()
bcachefs: Fix bkey_nocow_lock()
bcachefs: Fix accounting replay flags
bcachefs: Fix invalid shift in member_to_text()
bcachefs: Fix bch2_have_enough_devs() for BCH_SB_MEMBER_INVALID
bcachefs: __wait_for_freeing_inode: Switch to wait_bit_queue_entry
bcachefs: Check if stuck in journal_res_get()
closures: Add closure_wait_event_timeout()
bcachefs: Fix state lock involved deadlock
bcachefs: Fix NULL pointer dereference in bch2_opt_to_text
bcachefs: Release transaction before wake up
bcachefs: add check for btree id against max in try read node
bcachefs: Disk accounting device validation fixes
bcachefs: bch2_inode_or_descendents_is_open()
...

+977 -401
+10
fs/bcachefs/alloc_background.c
··· 639 639 continue; 640 640 } 641 641 642 + if (k.k->p.offset < ca->mi.first_bucket) { 643 + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket)); 644 + continue; 645 + } 646 + 647 + if (k.k->p.offset >= ca->mi.nbuckets) { 648 + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); 649 + continue; 650 + } 651 + 642 652 struct bch_alloc_v4 a; 643 653 *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; 644 654 0;
+2 -1
fs/bcachefs/bcachefs_format.h
··· 678 678 x(disk_accounting_v2, BCH_VERSION(1, 9)) \ 679 679 x(disk_accounting_v3, BCH_VERSION(1, 10)) \ 680 680 x(disk_accounting_inum, BCH_VERSION(1, 11)) \ 681 - x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) 681 + x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \ 682 + x(inode_has_child_snapshots, BCH_VERSION(1, 13)) 682 683 683 684 enum bcachefs_metadata_version { 684 685 bcachefs_metadata_version_min = 9,
+9 -6
fs/bcachefs/btree_gc.c
··· 1224 1224 u64 b, start_time = local_clock(); 1225 1225 int ret; 1226 1226 1227 - /* 1228 - * Ideally we would be using state_lock and not gc_gens_lock here, but that 1229 - * introduces a deadlock in the RO path - we currently take the state 1230 - * lock at the start of going RO, thus the gc thread may get stuck: 1231 - */ 1232 1227 if (!mutex_trylock(&c->gc_gens_lock)) 1233 1228 return 0; 1234 1229 1235 1230 trace_and_count(c, gc_gens_start, c); 1236 1231 1237 - down_read(&c->state_lock); 1232 + /* 1233 + * We have to use trylock here. Otherwise, we would 1234 + * introduce a deadlock in the RO path - we take the 1235 + * state lock at the start of going RO. 1236 + */ 1237 + if (!down_read_trylock(&c->state_lock)) { 1238 + mutex_unlock(&c->gc_gens_lock); 1239 + return 0; 1240 + } 1238 1241 1239 1242 for_each_member_device(c, ca) { 1240 1243 struct bucket_gens *gens = bucket_gens(ca);
+3 -2
fs/bcachefs/btree_io.c
··· 1838 1838 struct btree_trans *trans = bch2_trans_get(c); 1839 1839 1840 1840 btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); 1841 + 1842 + /* we don't need transaction context anymore after we got the lock. */ 1843 + bch2_trans_put(trans); 1841 1844 __btree_node_write_done(c, b); 1842 1845 six_unlock_read(&b->c.lock); 1843 - 1844 - bch2_trans_put(trans); 1845 1846 } 1846 1847 1847 1848 static void btree_node_write_work(struct work_struct *work)
+3 -3
fs/bcachefs/btree_iter.c
··· 2381 2381 else 2382 2382 iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k)); 2383 2383 2384 - if (unlikely(!(iter->flags & BTREE_ITER_is_extents) 2385 - ? bkey_gt(iter_pos, end) 2386 - : bkey_ge(iter_pos, end))) 2384 + if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(iter_pos, end) : 2385 + iter->flags & BTREE_ITER_is_extents ? bkey_ge(iter_pos, end) : 2386 + bkey_gt(iter_pos, end))) 2387 2387 goto end; 2388 2388 2389 2389 break;
+8
fs/bcachefs/btree_iter.h
··· 857 857 for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\ 858 858 SPOS_MAX, _flags, _k, _ret) 859 859 860 + #define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ 861 + _start, _flags, _k, _ret) \ 862 + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ 863 + (_start), (_flags)); \ 864 + (_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags), \ 865 + !((_ret) = bkey_err(_k)) && (_k).k; \ 866 + bch2_btree_iter_rewind(&(_iter))) 867 + 860 868 #define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ 861 869 for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) 862 870
+3
fs/bcachefs/btree_node_scan.c
··· 171 171 if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH) 172 172 return; 173 173 174 + if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX) 175 + return; 176 + 174 177 rcu_read_lock(); 175 178 struct found_btree_node n = { 176 179 .btree_id = BTREE_NODE_ID(bn),
+1
fs/bcachefs/data_update.c
··· 80 80 if (ptr2 == ptr) 81 81 break; 82 82 83 + ca = bch2_dev_have_ref(c, ptr2->dev); 83 84 bucket = PTR_BUCKET_POS(ca, ptr2); 84 85 bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); 85 86 }
+115 -36
fs/bcachefs/disk_accounting.c
··· 242 242 *p = swab64(*p); 243 243 } 244 244 245 + static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r, 246 + struct disk_accounting_pos acc) 247 + { 248 + unsafe_memcpy(r, &acc.replicas, 249 + replicas_entry_bytes(&acc.replicas), 250 + "variable length struct"); 251 + } 252 + 245 253 static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p) 246 254 { 247 255 struct disk_accounting_pos acc_k; ··· 257 249 258 250 switch (acc_k.type) { 259 251 case BCH_DISK_ACCOUNTING_replicas: 260 - unsafe_memcpy(r, &acc_k.replicas, 261 - replicas_entry_bytes(&acc_k.replicas), 262 - "variable length struct"); 252 + __accounting_to_replicas(r, acc_k); 263 253 return true; 264 254 default: 265 255 return false; ··· 614 608 return ret; 615 609 } 616 610 611 + static int bch2_disk_accounting_validate_late(struct btree_trans *trans, 612 + struct disk_accounting_pos acc, 613 + u64 *v, unsigned nr) 614 + { 615 + struct bch_fs *c = trans->c; 616 + struct printbuf buf = PRINTBUF; 617 + int ret = 0, invalid_dev = -1; 618 + 619 + switch (acc.type) { 620 + case BCH_DISK_ACCOUNTING_replicas: { 621 + struct bch_replicas_padded r; 622 + __accounting_to_replicas(&r.e, acc); 623 + 624 + for (unsigned i = 0; i < r.e.nr_devs; i++) 625 + if (r.e.devs[i] != BCH_SB_MEMBER_INVALID && 626 + !bch2_dev_exists(c, r.e.devs[i])) { 627 + invalid_dev = r.e.devs[i]; 628 + goto invalid_device; 629 + } 630 + 631 + /* 632 + * All replicas entry checks except for invalid device are done 633 + * in bch2_accounting_validate 634 + */ 635 + BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf)); 636 + 637 + if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e), 638 + trans, accounting_replicas_not_marked, 639 + "accounting not marked in superblock replicas\n %s", 640 + (printbuf_reset(&buf), 641 + bch2_accounting_key_to_text(&buf, &acc), 642 + buf.buf))) { 643 + /* 644 + * We're not RW yet and still single threaded, dropping 645 + * and retaking lock is ok: 646 + */ 
647 + percpu_up_write(&c->mark_lock); 648 + ret = bch2_mark_replicas(c, &r.e); 649 + if (ret) 650 + goto fsck_err; 651 + percpu_down_write(&c->mark_lock); 652 + } 653 + break; 654 + } 655 + 656 + case BCH_DISK_ACCOUNTING_dev_data_type: 657 + if (!bch2_dev_exists(c, acc.dev_data_type.dev)) { 658 + invalid_dev = acc.dev_data_type.dev; 659 + goto invalid_device; 660 + } 661 + break; 662 + } 663 + 664 + fsck_err: 665 + printbuf_exit(&buf); 666 + return ret; 667 + invalid_device: 668 + if (fsck_err(trans, accounting_to_invalid_device, 669 + "accounting entry points to invalid device %i\n %s", 670 + invalid_dev, 671 + (printbuf_reset(&buf), 672 + bch2_accounting_key_to_text(&buf, &acc), 673 + buf.buf))) { 674 + for (unsigned i = 0; i < nr; i++) 675 + v[i] = -v[i]; 676 + 677 + ret = commit_do(trans, NULL, NULL, 0, 678 + bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?: 679 + -BCH_ERR_remove_disk_accounting_entry; 680 + } else { 681 + ret = -BCH_ERR_remove_disk_accounting_entry; 682 + } 683 + goto fsck_err; 684 + } 685 + 617 686 /* 618 687 * At startup time, initialize the in memory accounting from the btree (and 619 688 * journal) ··· 747 666 } 748 667 keys->gap = keys->nr = dst - keys->data; 749 668 750 - percpu_down_read(&c->mark_lock); 751 - for (unsigned i = 0; i < acc->k.nr; i++) { 669 + percpu_down_write(&c->mark_lock); 670 + unsigned i = 0; 671 + while (i < acc->k.nr) { 672 + unsigned idx = inorder_to_eytzinger0(i, acc->k.nr); 673 + 674 + struct disk_accounting_pos acc_k; 675 + bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos); 676 + 752 677 u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; 753 - bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); 754 - 755 - if (bch2_is_zero(v, sizeof(v[0]) * acc->k.data[i].nr_counters)) 756 - continue; 757 - 758 - struct bch_replicas_padded r; 759 - if (!accounting_to_replicas(&r.e, acc->k.data[i].pos)) 760 - continue; 678 + bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false); 761 679 762 680 /* 
763 - * If the replicas entry is invalid it'll get cleaned up by 764 - * check_allocations: 681 + * If the entry counters are zeroed, it should be treated as 682 + * nonexistent - it might point to an invalid device. 683 + * 684 + * Remove it, so that if it's re-added it gets re-marked in the 685 + * superblock: 765 686 */ 766 - if (bch2_replicas_entry_validate(&r.e, c, &buf)) 687 + ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters) 688 + ? -BCH_ERR_remove_disk_accounting_entry 689 + : bch2_disk_accounting_validate_late(trans, acc_k, 690 + v, acc->k.data[idx].nr_counters); 691 + 692 + if (ret == -BCH_ERR_remove_disk_accounting_entry) { 693 + free_percpu(acc->k.data[idx].v[0]); 694 + free_percpu(acc->k.data[idx].v[1]); 695 + darray_remove_item(&acc->k, &acc->k.data[idx]); 696 + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), 697 + accounting_pos_cmp, NULL); 698 + ret = 0; 767 699 continue; 768 - 769 - struct disk_accounting_pos k; 770 - bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos); 771 - 772 - if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e), 773 - trans, accounting_replicas_not_marked, 774 - "accounting not marked in superblock replicas\n %s", 775 - (printbuf_reset(&buf), 776 - bch2_accounting_key_to_text(&buf, &k), 777 - buf.buf))) { 778 - /* 779 - * We're not RW yet and still single threaded, dropping 780 - * and retaking lock is ok: 781 - */ 782 - percpu_up_read(&c->mark_lock); 783 - ret = bch2_mark_replicas(c, &r.e); 784 - if (ret) 785 - goto fsck_err; 786 - percpu_down_read(&c->mark_lock); 787 700 } 701 + 702 + if (ret) 703 + goto fsck_err; 704 + i++; 788 705 } 789 706 790 707 preempt_disable(); ··· 821 742 } 822 743 preempt_enable(); 823 744 fsck_err: 824 - percpu_up_read(&c->mark_lock); 745 + percpu_up_write(&c->mark_lock); 825 746 err: 826 747 printbuf_exit(&buf); 827 748 bch2_trans_put(trans);
+64 -30
fs/bcachefs/ec.c
··· 124 124 "incorrect value size (%zu < %u)", 125 125 bkey_val_u64s(k.k), stripe_val_u64s(s)); 126 126 127 + bkey_fsck_err_on(s->csum_granularity_bits >= 64, 128 + c, stripe_csum_granularity_bad, 129 + "invalid csum granularity (%u >= 64)", 130 + s->csum_granularity_bits); 131 + 127 132 ret = bch2_bkey_ptrs_validate(c, k, flags); 128 133 fsck_err: 129 134 return ret; ··· 150 145 nr_data, 151 146 s.nr_redundant); 152 147 bch2_prt_csum_type(out, s.csum_type); 153 - prt_printf(out, " gran %u", 1U << s.csum_granularity_bits); 148 + prt_str(out, " gran "); 149 + if (s.csum_granularity_bits < 64) 150 + prt_printf(out, "%llu", 1ULL << s.csum_granularity_bits); 151 + else 152 + prt_printf(out, "(invalid shift %u)", s.csum_granularity_bits); 154 153 155 154 if (s.disk_label) { 156 155 prt_str(out, " label"); ··· 1206 1197 /* stripe creation: */ 1207 1198 1208 1199 static int ec_stripe_key_update(struct btree_trans *trans, 1209 - struct bkey_i_stripe *new, 1210 - bool create) 1200 + struct bkey_i_stripe *old, 1201 + struct bkey_i_stripe *new) 1211 1202 { 1212 1203 struct bch_fs *c = trans->c; 1213 - struct btree_iter iter; 1214 - struct bkey_s_c k; 1215 - int ret; 1204 + bool create = !old; 1216 1205 1217 - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, 1218 - new->k.p, BTREE_ITER_intent); 1219 - ret = bkey_err(k); 1206 + struct btree_iter iter; 1207 + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, 1208 + new->k.p, BTREE_ITER_intent); 1209 + int ret = bkey_err(k); 1220 1210 if (ret) 1221 1211 goto err; 1222 1212 1223 - if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) { 1224 - bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s", 1225 - create ? "creating" : "updating", 1226 - bch2_bkey_types[k.k->type]); 1213 + if (bch2_fs_inconsistent_on(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe), 1214 + c, "error %s stripe: got existing key type %s", 1215 + create ? 
"creating" : "updating", 1216 + bch2_bkey_types[k.k->type])) { 1227 1217 ret = -EINVAL; 1228 1218 goto err; 1229 1219 } 1230 1220 1231 1221 if (k.k->type == KEY_TYPE_stripe) { 1232 - const struct bch_stripe *old = bkey_s_c_to_stripe(k).v; 1233 - unsigned i; 1222 + const struct bch_stripe *v = bkey_s_c_to_stripe(k).v; 1234 1223 1235 - if (old->nr_blocks != new->v.nr_blocks) { 1236 - bch_err(c, "error updating stripe: nr_blocks does not match"); 1237 - ret = -EINVAL; 1238 - goto err; 1239 - } 1224 + BUG_ON(old->v.nr_blocks != new->v.nr_blocks); 1225 + BUG_ON(old->v.nr_blocks != v->nr_blocks); 1240 1226 1241 - for (i = 0; i < new->v.nr_blocks; i++) { 1242 - unsigned v = stripe_blockcount_get(old, i); 1227 + for (unsigned i = 0; i < new->v.nr_blocks; i++) { 1228 + unsigned sectors = stripe_blockcount_get(v, i); 1243 1229 1244 - BUG_ON(v && 1245 - (old->ptrs[i].dev != new->v.ptrs[i].dev || 1246 - old->ptrs[i].gen != new->v.ptrs[i].gen || 1247 - old->ptrs[i].offset != new->v.ptrs[i].offset)); 1230 + if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) { 1231 + struct printbuf buf = PRINTBUF; 1248 1232 1249 - stripe_blockcount_set(&new->v, i, v); 1233 + prt_printf(&buf, "stripe changed nonempty block %u", i); 1234 + prt_str(&buf, "\nold: "); 1235 + bch2_bkey_val_to_text(&buf, c, k); 1236 + prt_str(&buf, "\nnew: "); 1237 + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i)); 1238 + bch2_fs_inconsistent(c, "%s", buf.buf); 1239 + printbuf_exit(&buf); 1240 + ret = -EINVAL; 1241 + goto err; 1242 + } 1243 + 1244 + /* 1245 + * If the stripe ptr changed underneath us, it must have 1246 + * been dev_remove_stripes() -> * invalidate_stripe_to_dev() 1247 + */ 1248 + if (!bch2_extent_ptr_eq(old->v.ptrs[i], v->ptrs[i])) { 1249 + BUG_ON(v->ptrs[i].dev != BCH_SB_MEMBER_INVALID); 1250 + 1251 + if (bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i])) 1252 + new->v.ptrs[i].dev = BCH_SB_MEMBER_INVALID; 1253 + } 1254 + 1255 + stripe_blockcount_set(&new->v, i, sectors); 
1250 1256 } 1251 1257 } 1252 1258 ··· 1523 1499 BCH_TRANS_COMMIT_no_check_rw| 1524 1500 BCH_TRANS_COMMIT_no_enospc, 1525 1501 ec_stripe_key_update(trans, 1526 - bkey_i_to_stripe(&s->new_stripe.key), 1527 - !s->have_existing_stripe)); 1502 + s->have_existing_stripe 1503 + ? bkey_i_to_stripe(&s->existing_stripe.key) 1504 + : NULL, 1505 + bkey_i_to_stripe(&s->new_stripe.key))); 1528 1506 bch_err_msg(c, ret, "creating stripe key"); 1529 1507 if (ret) { 1530 1508 goto err; ··· 1902 1876 bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); 1903 1877 1904 1878 for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) { 1905 - __clear_bit(v->ptrs[i].dev, devs.d); 1879 + /* 1880 + * Note: we don't yet repair invalid blocks (failed/removed 1881 + * devices) when reusing stripes - we still need a codepath to 1882 + * walk backpointers and update all extents that point to that 1883 + * block when updating the stripe 1884 + */ 1885 + if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID) 1886 + __clear_bit(v->ptrs[i].dev, devs.d); 1887 + 1906 1888 if (i < h->s->nr_data) 1907 1889 nr_have_data++; 1908 1890 else
+2 -1
fs/bcachefs/errcode.h
··· 268 268 x(BCH_ERR_nopromote, nopromote_no_writes) \ 269 269 x(BCH_ERR_nopromote, nopromote_enomem) \ 270 270 x(0, invalid_snapshot_node) \ 271 - x(0, option_needs_open_fs) 271 + x(0, option_needs_open_fs) \ 272 + x(0, remove_disk_accounting_entry) 272 273 273 274 enum bch_errcode { 274 275 BCH_ERR_START = 2048,
+10
fs/bcachefs/extents.h
··· 695 695 int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c, 696 696 enum bch_validate_flags); 697 697 698 + static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, 699 + struct bch_extent_ptr ptr2) 700 + { 701 + return (ptr1.cached == ptr2.cached && 702 + ptr1.unwritten == ptr2.unwritten && 703 + ptr1.offset == ptr2.offset && 704 + ptr1.dev == ptr2.dev && 705 + ptr1.dev == ptr2.dev); 706 + } 707 + 698 708 void bch2_ptr_swab(struct bkey_s); 699 709 700 710 const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
+2 -1
fs/bcachefs/fs-io-direct.c
··· 369 369 370 370 static __always_inline long bch2_dio_write_done(struct dio_write *dio) 371 371 { 372 + struct bch_fs *c = dio->op.c; 372 373 struct kiocb *req = dio->req; 373 374 struct bch_inode_info *inode = dio->inode; 374 375 bool sync = dio->sync; ··· 388 387 ret = dio->op.error ?: ((long) dio->written << 9); 389 388 bio_put(&dio->op.wbio.bio); 390 389 391 - bch2_write_ref_put(dio->op.c, BCH_WRITE_REF_dio_write); 390 + bch2_write_ref_put(c, BCH_WRITE_REF_dio_write); 392 391 393 392 /* inode->i_dio_count is our ref on inode and thus bch_fs */ 394 393 inode_dio_end(&inode->v);
+99 -3
fs/bcachefs/fs.c
··· 157 157 return a.subvol == b.subvol && a.inum == b.inum; 158 158 } 159 159 160 + static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) 161 + { 162 + const subvol_inum *inum = data; 163 + 164 + return jhash(&inum->inum, sizeof(inum->inum), seed); 165 + } 166 + 167 + static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed) 168 + { 169 + const struct bch_inode_info *inode = data; 170 + 171 + return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed); 172 + } 173 + 160 174 static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg, 161 175 const void *obj) 162 176 { ··· 184 170 .head_offset = offsetof(struct bch_inode_info, hash), 185 171 .key_offset = offsetof(struct bch_inode_info, ei_inum), 186 172 .key_len = sizeof(subvol_inum), 173 + .hashfn = bch2_vfs_inode_hash_fn, 174 + .obj_hashfn = bch2_vfs_inode_obj_hash_fn, 187 175 .obj_cmpfn = bch2_vfs_inode_cmp_fn, 188 176 .automatic_shrinking = true, 189 177 }; 190 178 191 - struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) 179 + int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) 180 + { 181 + struct bch_fs *c = trans->c; 182 + struct rhashtable *ht = &c->vfs_inodes_table; 183 + subvol_inum inum = (subvol_inum) { .inum = p.offset }; 184 + DARRAY(u32) subvols; 185 + int ret = 0; 186 + 187 + if (!test_bit(BCH_FS_started, &c->flags)) 188 + return false; 189 + 190 + darray_init(&subvols); 191 + restart_from_top: 192 + 193 + /* 194 + * Tweaked version of __rhashtable_lookup(); we need to get a list of 195 + * subvolumes in which the given inode number is open. 196 + * 197 + * For this to work, we don't include the subvolume ID in the key that 198 + * we hash - all inodes with the same inode number regardless of 199 + * subvolume will hash to the same slot. 
200 + * 201 + * This will be less than ideal if the same file is ever open 202 + * simultaneously in many different snapshots: 203 + */ 204 + rcu_read_lock(); 205 + struct rhash_lock_head __rcu *const *bkt; 206 + struct rhash_head *he; 207 + unsigned int hash; 208 + struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); 209 + restart: 210 + hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params); 211 + bkt = rht_bucket(tbl, hash); 212 + do { 213 + struct bch_inode_info *inode; 214 + 215 + rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) { 216 + if (inode->ei_inum.inum == inum.inum) { 217 + ret = darray_push_gfp(&subvols, inode->ei_inum.subvol, 218 + GFP_NOWAIT|__GFP_NOWARN); 219 + if (ret) { 220 + rcu_read_unlock(); 221 + ret = darray_make_room(&subvols, 1); 222 + if (ret) 223 + goto err; 224 + subvols.nr = 0; 225 + goto restart_from_top; 226 + } 227 + } 228 + } 229 + /* An object might have been moved to a different hash chain, 230 + * while we walk along it - better check and retry. 231 + */ 232 + } while (he != RHT_NULLS_MARKER(bkt)); 233 + 234 + /* Ensure we see any new tables. 
*/ 235 + smp_rmb(); 236 + 237 + tbl = rht_dereference_rcu(tbl->future_tbl, ht); 238 + if (unlikely(tbl)) 239 + goto restart; 240 + rcu_read_unlock(); 241 + 242 + darray_for_each(subvols, i) { 243 + u32 snap; 244 + ret = bch2_subvolume_get_snapshot(trans, *i, &snap); 245 + if (ret) 246 + goto err; 247 + 248 + ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot); 249 + if (ret) 250 + break; 251 + } 252 + err: 253 + darray_exit(&subvols); 254 + return ret; 255 + } 256 + 257 + static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) 192 258 { 193 259 return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params); 194 260 } ··· 278 184 subvol_inum inum) 279 185 { 280 186 wait_queue_head_t *wq; 281 - DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW); 187 + struct wait_bit_queue_entry wait; 188 + 282 189 wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW); 283 190 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); 284 191 spin_unlock(&inode->v.i_lock); ··· 347 252 348 253 set_bit(EI_INODE_HASHED, &inode->ei_flags); 349 254 retry: 350 - if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table, 255 + if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table, 256 + &inode->ei_inum, 351 257 &inode->hash, 352 258 bch2_vfs_inodes_params))) { 353 259 old = bch2_inode_hash_find(c, trans, inode->ei_inum);
+3 -6
fs/bcachefs/fs.h
··· 54 54 return inode->ei_inum; 55 55 } 56 56 57 - struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum); 58 - 59 57 /* 60 58 * Set if we've gotten a btree error for this inode, and thus the vfs inode and 61 59 * btree inode may be inconsistent: ··· 146 148 __bch2_create(struct mnt_idmap *, struct bch_inode_info *, 147 149 struct dentry *, umode_t, dev_t, subvol_inum, unsigned); 148 150 151 + int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p); 152 + 149 153 int bch2_fs_quota_transfer(struct bch_fs *, 150 154 struct bch_inode_info *, 151 155 struct bch_qid, ··· 198 198 199 199 #define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); }) 200 200 201 - static inline struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) 202 - { 203 - return NULL; 204 - } 201 + static inline int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { return 0; } 205 202 206 203 static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, 207 204 snapshot_id_list *s) {}
+231 -153
fs/bcachefs/fsck.c
··· 326 326 return ret; 327 327 } 328 328 329 + static inline bool inode_should_reattach(struct bch_inode_unpacked *inode) 330 + { 331 + if (inode->bi_inum == BCACHEFS_ROOT_INO && 332 + inode->bi_subvol == BCACHEFS_ROOT_SUBVOL) 333 + return false; 334 + 335 + return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked); 336 + } 337 + 338 + static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot) 339 + { 340 + struct btree_iter iter; 341 + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents, 342 + SPOS(d_pos.inode, d_pos.offset, snapshot), 343 + BTREE_ITER_intent| 344 + BTREE_ITER_with_updates); 345 + int ret = bkey_err(k); 346 + if (ret) 347 + return ret; 348 + 349 + if (bpos_eq(k.k->p, d_pos)) { 350 + /* 351 + * delet_at() doesn't work because the update path doesn't 352 + * internally use BTREE_ITER_with_updates yet 353 + */ 354 + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); 355 + ret = PTR_ERR_OR_ZERO(k); 356 + if (ret) 357 + goto err; 358 + 359 + bkey_init(&k->k); 360 + k->k.type = KEY_TYPE_whiteout; 361 + k->k.p = iter.pos; 362 + ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node); 363 + } 364 + err: 365 + bch2_trans_iter_exit(trans, &iter); 366 + return ret; 367 + } 368 + 329 369 static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode) 330 370 { 331 371 struct bch_fs *c = trans->c; 332 - struct bch_hash_info dir_hash; 333 372 struct bch_inode_unpacked lostfound; 334 373 char name_buf[20]; 335 - struct qstr name; 336 - u64 dir_offset = 0; 337 - u32 dirent_snapshot = inode->bi_snapshot; 338 374 int ret; 339 375 376 + u32 dirent_snapshot = inode->bi_snapshot; 340 377 if (inode->bi_subvol) { 341 378 inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL; 342 379 ··· 404 367 if (ret) 405 368 return ret; 406 369 407 - dir_hash = bch2_hash_info_init(c, &lostfound); 370 + struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound); 371 + struct 
qstr name = (struct qstr) QSTR(name_buf); 408 372 409 - name = (struct qstr) QSTR(name_buf); 373 + inode->bi_dir = lostfound.bi_inum; 410 374 411 375 ret = bch2_dirent_create_snapshot(trans, 412 376 inode->bi_parent_subvol, lostfound.bi_inum, ··· 416 378 inode_d_type(inode), 417 379 &name, 418 380 inode->bi_subvol ?: inode->bi_inum, 419 - &dir_offset, 381 + &inode->bi_dir_offset, 420 382 STR_HASH_must_create); 421 383 if (ret) { 422 384 bch_err_msg(c, ret, "error creating dirent"); 423 385 return ret; 424 386 } 425 387 426 - inode->bi_dir = lostfound.bi_inum; 427 - inode->bi_dir_offset = dir_offset; 388 + ret = __bch2_fsck_write_inode(trans, inode); 389 + if (ret) 390 + return ret; 428 391 429 - return __bch2_fsck_write_inode(trans, inode); 392 + /* 393 + * Fix up inodes in child snapshots: if they should also be reattached 394 + * update the backpointer field, if they should not be we need to emit 395 + * whiteouts for the dirent we just created. 396 + */ 397 + if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) { 398 + snapshot_id_list whiteouts_done; 399 + struct btree_iter iter; 400 + struct bkey_s_c k; 401 + 402 + darray_init(&whiteouts_done); 403 + 404 + for_each_btree_key_reverse_norestart(trans, iter, 405 + BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1), 406 + BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) { 407 + if (k.k->p.offset != inode->bi_inum) 408 + break; 409 + 410 + if (!bkey_is_inode(k.k) || 411 + !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) || 412 + snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot)) 413 + continue; 414 + 415 + struct bch_inode_unpacked child_inode; 416 + bch2_inode_unpack(k, &child_inode); 417 + 418 + if (!inode_should_reattach(&child_inode)) { 419 + ret = maybe_delete_dirent(trans, 420 + SPOS(lostfound.bi_inum, inode->bi_dir_offset, 421 + dirent_snapshot), 422 + k.k->p.snapshot); 423 + if (ret) 424 + break; 425 + 426 + ret = snapshot_list_add(c, 
&whiteouts_done, k.k->p.snapshot); 427 + if (ret) 428 + break; 429 + } else { 430 + iter.snapshot = k.k->p.snapshot; 431 + child_inode.bi_dir = inode->bi_dir; 432 + child_inode.bi_dir_offset = inode->bi_dir_offset; 433 + 434 + ret = bch2_inode_write_flags(trans, &iter, &child_inode, 435 + BTREE_UPDATE_internal_snapshot_node); 436 + if (ret) 437 + break; 438 + } 439 + } 440 + darray_exit(&whiteouts_done); 441 + bch2_trans_iter_exit(trans, &iter); 442 + } 443 + 444 + return ret; 430 445 } 431 446 432 447 static int remove_backpointer(struct btree_trans *trans, ··· 1085 994 */ 1086 995 inode->bi_dir = 0; 1087 996 inode->bi_dir_offset = 0; 1088 - inode->bi_flags &= ~BCH_INODE_backptr_untrusted; 1089 997 *write_inode = true; 1090 998 } 1091 999 ··· 1096 1006 return ret; 1097 1007 } 1098 1008 1099 - static bool bch2_inode_is_open(struct bch_fs *c, struct bpos p) 1100 - { 1101 - subvol_inum inum = { 1102 - .subvol = snapshot_t(c, p.snapshot)->subvol, 1103 - .inum = p.offset, 1104 - }; 1105 - 1106 - /* snapshot tree corruption, can't safely delete */ 1107 - if (!inum.subvol) { 1108 - bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot); 1109 - return true; 1110 - } 1111 - 1112 - return __bch2_inode_hash_find(c, inum) != NULL; 1113 - } 1114 - 1115 1009 static int check_inode(struct btree_trans *trans, 1116 1010 struct btree_iter *iter, 1117 1011 struct bkey_s_c k, 1118 1012 struct bch_inode_unpacked *prev, 1119 - struct snapshots_seen *s, 1120 - bool full) 1013 + struct snapshots_seen *s) 1121 1014 { 1122 1015 struct bch_fs *c = trans->c; 1123 1016 struct printbuf buf = PRINTBUF; ··· 1122 1049 return 0; 1123 1050 1124 1051 BUG_ON(bch2_inode_unpack(k, &u)); 1125 - 1126 - if (!full && 1127 - !(u.bi_flags & (BCH_INODE_i_size_dirty| 1128 - BCH_INODE_i_sectors_dirty| 1129 - BCH_INODE_unlinked))) 1130 - return 0; 1131 1052 1132 1053 if (prev->bi_inum != u.bi_inum) 1133 1054 *prev = u; ··· 1168 1101 ret = 0; 1169 1102 } 
1170 1103 1171 - if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) && 1172 - bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) { 1173 - struct bpos new_min_pos; 1104 + ret = bch2_inode_has_child_snapshots(trans, k.k->p); 1105 + if (ret < 0) 1106 + goto err; 1174 1107 1175 - ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos); 1108 + if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot), 1109 + trans, inode_has_child_snapshots_wrong, 1110 + "inode has_child_snapshots flag wrong (should be %u)\n%s", 1111 + ret, 1112 + (printbuf_reset(&buf), 1113 + bch2_inode_unpacked_to_text(&buf, &u), 1114 + buf.buf))) { 1176 1115 if (ret) 1177 - goto err; 1178 - 1179 - u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked; 1180 - 1181 - ret = __bch2_fsck_write_inode(trans, &u); 1182 - 1183 - bch_err_msg(c, ret, "in fsck updating inode"); 1184 - if (ret) 1185 - goto err_noprint; 1186 - 1187 - if (!bpos_eq(new_min_pos, POS_MIN)) 1188 - bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos)); 1189 - goto err_noprint; 1116 + u.bi_flags |= BCH_INODE_has_child_snapshot; 1117 + else 1118 + u.bi_flags &= ~BCH_INODE_has_child_snapshot; 1119 + do_update = true; 1190 1120 } 1121 + ret = 0; 1191 1122 1192 - if (u.bi_flags & BCH_INODE_unlinked) { 1123 + if ((u.bi_flags & BCH_INODE_unlinked) && 1124 + !(u.bi_flags & BCH_INODE_has_child_snapshot)) { 1193 1125 if (!test_bit(BCH_FS_started, &c->flags)) { 1194 1126 /* 1195 1127 * If we're not in online fsck, don't delete unlinked ··· 1213 1147 if (ret) 1214 1148 goto err; 1215 1149 } else { 1216 - if (fsck_err_on(!bch2_inode_is_open(c, k.k->p), 1150 + ret = bch2_inode_or_descendents_is_open(trans, k.k->p); 1151 + if (ret < 0) 1152 + goto err; 1153 + 1154 + if (fsck_err_on(!ret, 1217 1155 trans, inode_unlinked_and_not_open, 1218 1156 "inode %llu%u unlinked and not open", 1219 1157 u.bi_inum, u.bi_snapshot)) { ··· 1225 1155 bch_err_msg(c, ret, "in fsck deleting 
inode"); 1226 1156 goto err_noprint; 1227 1157 } 1158 + ret = 0; 1228 1159 } 1229 - } 1230 - 1231 - /* i_size_dirty is vestigal, since we now have logged ops for truncate * */ 1232 - if (u.bi_flags & BCH_INODE_i_size_dirty && 1233 - (!test_bit(BCH_FS_clean_recovery, &c->flags) || 1234 - fsck_err(trans, inode_i_size_dirty_but_clean, 1235 - "filesystem marked clean, but inode %llu has i_size dirty", 1236 - u.bi_inum))) { 1237 - bch_verbose(c, "truncating inode %llu", u.bi_inum); 1238 - 1239 - /* 1240 - * XXX: need to truncate partial blocks too here - or ideally 1241 - * just switch units to bytes and that issue goes away 1242 - */ 1243 - ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, 1244 - SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9, 1245 - iter->pos.snapshot), 1246 - POS(u.bi_inum, U64_MAX), 1247 - 0, NULL); 1248 - bch_err_msg(c, ret, "in fsck truncating inode"); 1249 - if (ret) 1250 - return ret; 1251 - 1252 - /* 1253 - * We truncated without our normal sector accounting hook, just 1254 - * make sure we recalculate it: 1255 - */ 1256 - u.bi_flags |= BCH_INODE_i_sectors_dirty; 1257 - 1258 - u.bi_flags &= ~BCH_INODE_i_size_dirty; 1259 - do_update = true; 1260 - } 1261 - 1262 - /* i_sectors_dirty is vestigal, i_sectors is always updated transactionally */ 1263 - if (u.bi_flags & BCH_INODE_i_sectors_dirty && 1264 - (!test_bit(BCH_FS_clean_recovery, &c->flags) || 1265 - fsck_err(trans, inode_i_sectors_dirty_but_clean, 1266 - "filesystem marked clean, but inode %llu has i_sectors dirty", 1267 - u.bi_inum))) { 1268 - s64 sectors; 1269 - 1270 - bch_verbose(c, "recounting sectors for inode %llu", 1271 - u.bi_inum); 1272 - 1273 - sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); 1274 - if (sectors < 0) { 1275 - bch_err_msg(c, sectors, "in fsck recounting inode sectors"); 1276 - return sectors; 1277 - } 1278 - 1279 - u.bi_sectors = sectors; 1280 - u.bi_flags &= ~BCH_INODE_i_sectors_dirty; 1281 - do_update = true; 1282 - } 
1283 - 1284 - if (u.bi_flags & BCH_INODE_backptr_untrusted) { 1285 - u.bi_dir = 0; 1286 - u.bi_dir_offset = 0; 1287 - u.bi_flags &= ~BCH_INODE_backptr_untrusted; 1288 - do_update = true; 1289 1160 } 1290 1161 1291 1162 if (fsck_err_on(u.bi_parent_subvol && ··· 1285 1274 1286 1275 int bch2_check_inodes(struct bch_fs *c) 1287 1276 { 1288 - bool full = c->opts.fsck; 1289 1277 struct bch_inode_unpacked prev = { 0 }; 1290 1278 struct snapshots_seen s; 1291 1279 ··· 1295 1285 POS_MIN, 1296 1286 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, 1297 1287 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 1298 - check_inode(trans, &iter, k, &prev, &s, full))); 1288 + check_inode(trans, &iter, k, &prev, &s))); 1299 1289 1300 1290 snapshots_seen_exit(&s); 1291 + bch_err_fn(c, ret); 1292 + return ret; 1293 + } 1294 + 1295 + static int find_oldest_inode_needs_reattach(struct btree_trans *trans, 1296 + struct bch_inode_unpacked *inode) 1297 + { 1298 + struct bch_fs *c = trans->c; 1299 + struct btree_iter iter; 1300 + struct bkey_s_c k; 1301 + int ret = 0; 1302 + 1303 + /* 1304 + * We look for inodes to reattach in natural key order, leaves first, 1305 + * but we should do the reattach at the oldest version that needs to be 1306 + * reattached: 1307 + */ 1308 + for_each_btree_key_norestart(trans, iter, 1309 + BTREE_ID_inodes, 1310 + SPOS(0, inode->bi_inum, inode->bi_snapshot + 1), 1311 + BTREE_ITER_all_snapshots, k, ret) { 1312 + if (k.k->p.offset != inode->bi_inum) 1313 + break; 1314 + 1315 + if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot)) 1316 + continue; 1317 + 1318 + if (!bkey_is_inode(k.k)) 1319 + break; 1320 + 1321 + struct bch_inode_unpacked parent_inode; 1322 + bch2_inode_unpack(k, &parent_inode); 1323 + 1324 + if (!inode_should_reattach(&parent_inode)) 1325 + break; 1326 + 1327 + *inode = parent_inode; 1328 + } 1329 + bch2_trans_iter_exit(trans, &iter); 1330 + 1331 + return ret; 1332 + } 1333 + 1334 + static int check_unreachable_inode(struct btree_trans 
*trans, 1335 + struct btree_iter *iter, 1336 + struct bkey_s_c k) 1337 + { 1338 + struct printbuf buf = PRINTBUF; 1339 + int ret = 0; 1340 + 1341 + if (!bkey_is_inode(k.k)) 1342 + return 0; 1343 + 1344 + struct bch_inode_unpacked inode; 1345 + BUG_ON(bch2_inode_unpack(k, &inode)); 1346 + 1347 + if (!inode_should_reattach(&inode)) 1348 + return 0; 1349 + 1350 + ret = find_oldest_inode_needs_reattach(trans, &inode); 1351 + if (ret) 1352 + return ret; 1353 + 1354 + if (fsck_err(trans, inode_unreachable, 1355 + "unreachable inode:\n%s", 1356 + (bch2_inode_unpacked_to_text(&buf, &inode), 1357 + buf.buf))) 1358 + ret = reattach_inode(trans, &inode); 1359 + fsck_err: 1360 + printbuf_exit(&buf); 1361 + return ret; 1362 + } 1363 + 1364 + /* 1365 + * Reattach unreachable (but not unlinked) inodes 1366 + * 1367 + * Run after check_inodes() and check_dirents(), so we node that inode 1368 + * backpointer fields point to valid dirents, and every inode that has a dirent 1369 + * that points to it has its backpointer field set - so we're just looking for 1370 + * non-unlinked inodes without backpointers: 1371 + * 1372 + * XXX: this is racy w.r.t. 
hardlink removal in online fsck 1373 + */ 1374 + int bch2_check_unreachable_inodes(struct bch_fs *c) 1375 + { 1376 + int ret = bch2_trans_run(c, 1377 + for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, 1378 + POS_MIN, 1379 + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, 1380 + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 1381 + check_unreachable_inode(trans, &iter, k))); 1301 1382 bch_err_fn(c, ret); 1302 1383 return ret; 1303 1384 } ··· 1795 1694 !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) 1796 1695 continue; 1797 1696 1798 - if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) && 1799 - k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && 1697 + if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && 1800 1698 !bkey_extent_is_reservation(k), 1801 1699 trans, extent_past_end_of_inode, 1802 1700 "extent type past end of inode %llu:%u, i_size %llu\n %s", ··· 2550 2450 if (ret) 2551 2451 break; 2552 2452 2553 - /* 2554 - * We've checked that inode backpointers point to valid dirents; 2555 - * here, it's sufficient to check that the subvolume root has a 2556 - * dirent: 2557 - */ 2558 - if (fsck_err_on(!subvol_root.bi_dir, 2559 - trans, subvol_unreachable, 2560 - "unreachable subvolume %s", 2561 - (bch2_bkey_val_to_text(&buf, c, s.s_c), 2562 - prt_newline(&buf), 2563 - bch2_inode_unpacked_to_text(&buf, &subvol_root), 2564 - buf.buf))) { 2565 - ret = reattach_subvol(trans, s); 2566 - break; 2567 - } 2568 - 2569 2453 u32 parent = le32_to_cpu(s.v->fs_path_parent); 2570 2454 2571 2455 if (darray_u32_has(&subvol_path, parent)) { ··· 2610 2526 return false; 2611 2527 } 2612 2528 2613 - /* 2614 - * Check that a given inode is reachable from its subvolume root - we already 2615 - * verified subvolume connectivity: 2616 - * 2617 - * XXX: we should also be verifying that inodes are in the right subvolumes 2618 - */ 2619 2529 static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c 
inode_k) 2620 2530 { 2621 2531 struct bch_fs *c = trans->c; ··· 2622 2544 p->nr = 0; 2623 2545 2624 2546 BUG_ON(bch2_inode_unpack(inode_k, &inode)); 2547 + 2548 + if (!S_ISDIR(inode.bi_mode)) 2549 + return 0; 2625 2550 2626 2551 while (!inode.bi_subvol) { 2627 2552 struct btree_iter dirent_iter; ··· 2640 2559 bch2_trans_iter_exit(trans, &dirent_iter); 2641 2560 2642 2561 if (bch2_err_matches(ret, ENOENT)) { 2643 - ret = 0; 2644 - if (fsck_err(trans, inode_unreachable, 2645 - "unreachable inode\n%s", 2646 - (printbuf_reset(&buf), 2647 - bch2_bkey_val_to_text(&buf, c, inode_k), 2648 - buf.buf))) 2649 - ret = reattach_inode(trans, &inode); 2562 + printbuf_reset(&buf); 2563 + bch2_bkey_val_to_text(&buf, c, inode_k); 2564 + bch_err(c, "unreachable inode in check_directory_structure: %s\n%s", 2565 + bch2_err_str(ret), buf.buf); 2650 2566 goto out; 2651 2567 } 2652 2568 2653 2569 bch2_trans_iter_exit(trans, &dirent_iter); 2654 - 2655 - if (!S_ISDIR(inode.bi_mode)) 2656 - break; 2657 2570 2658 2571 ret = darray_push(p, ((struct pathbuf_entry) { 2659 2572 .inum = inode.bi_inum, ··· 2701 2626 } 2702 2627 2703 2628 /* 2704 - * Check for unreachable inodes, as well as loops in the directory structure: 2705 - * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's 2706 - * unreachable: 2629 + * Check for loops in the directory structure: all other connectivity issues 2630 + * have been fixed by prior passes 2707 2631 */ 2708 2632 int bch2_check_directory_structure(struct bch_fs *c) 2709 2633 { ··· 2830 2756 if (S_ISDIR(u.bi_mode)) 2831 2757 continue; 2832 2758 2759 + /* 2760 + * Previous passes ensured that bi_nlink is nonzero if 2761 + * it had multiple hardlinks: 2762 + */ 2833 2763 if (!u.bi_nlink) 2834 2764 continue; 2835 2765
+1
fs/bcachefs/fsck.h
··· 9 9 int bch2_check_xattrs(struct bch_fs *); 10 10 int bch2_check_root(struct bch_fs *); 11 11 int bch2_check_subvolume_structure(struct bch_fs *); 12 + int bch2_check_unreachable_inodes(struct bch_fs *); 12 13 int bch2_check_directory_structure(struct bch_fs *); 13 14 int bch2_check_nlinks(struct bch_fs *); 14 15 int bch2_fix_reflink_p(struct bch_fs *);
+242 -33
fs/bcachefs/inode.c
··· 12 12 #include "error.h" 13 13 #include "extents.h" 14 14 #include "extent_update.h" 15 + #include "fs.h" 15 16 #include "inode.h" 16 17 #include "str_hash.h" 17 18 #include "snapshot.h" ··· 34 33 NULL 35 34 }; 36 35 #undef x 36 + 37 + static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos); 37 38 38 39 static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; 39 40 ··· 578 575 } 579 576 } 580 577 581 - static inline bool bkey_is_deleted_inode(struct bkey_s_c k) 578 + static inline void bkey_inode_flags_set(struct bkey_s k, u64 f) 582 579 { 583 - return bkey_inode_flags(k) & BCH_INODE_unlinked; 580 + switch (k.k->type) { 581 + case KEY_TYPE_inode: 582 + bkey_s_to_inode(k).v->bi_flags = cpu_to_le32(f); 583 + return; 584 + case KEY_TYPE_inode_v2: 585 + bkey_s_to_inode_v2(k).v->bi_flags = cpu_to_le64(f); 586 + return; 587 + case KEY_TYPE_inode_v3: 588 + bkey_s_to_inode_v3(k).v->bi_flags = cpu_to_le64(f); 589 + return; 590 + default: 591 + BUG(); 592 + } 593 + } 594 + 595 + static inline bool bkey_is_unlinked_inode(struct bkey_s_c k) 596 + { 597 + unsigned f = bkey_inode_flags(k) & BCH_INODE_unlinked; 598 + 599 + return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot); 600 + } 601 + 602 + static struct bkey_s_c 603 + bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter, 604 + enum btree_id btree, struct bpos pos, 605 + unsigned flags) 606 + { 607 + struct bch_fs *c = trans->c; 608 + struct bkey_s_c k; 609 + int ret = 0; 610 + 611 + for_each_btree_key_upto_norestart(trans, *iter, btree, 612 + bpos_successor(pos), 613 + SPOS(pos.inode, pos.offset, U32_MAX), 614 + flags|BTREE_ITER_all_snapshots, k, ret) 615 + if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot)) 616 + return k; 617 + 618 + bch2_trans_iter_exit(trans, iter); 619 + return ret ? 
bkey_s_c_err(ret) : bkey_s_c_null; 620 + } 621 + 622 + static struct bkey_s_c 623 + bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter, 624 + struct bpos pos, unsigned flags) 625 + { 626 + struct bkey_s_c k; 627 + again: 628 + k = bch2_bkey_get_iter_snapshot_parent(trans, iter, BTREE_ID_inodes, pos, flags); 629 + if (!k.k || 630 + bkey_err(k) || 631 + bkey_is_inode(k.k)) 632 + return k; 633 + 634 + bch2_trans_iter_exit(trans, iter); 635 + pos = k.k->p; 636 + goto again; 637 + } 638 + 639 + int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) 640 + { 641 + struct bch_fs *c = trans->c; 642 + struct btree_iter iter; 643 + struct bkey_s_c k; 644 + int ret = 0; 645 + 646 + for_each_btree_key_upto_norestart(trans, iter, 647 + BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos), 648 + BTREE_ITER_all_snapshots| 649 + BTREE_ITER_with_updates, k, ret) 650 + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) && 651 + bkey_is_inode(k.k)) { 652 + ret = 1; 653 + break; 654 + } 655 + bch2_trans_iter_exit(trans, &iter); 656 + return ret; 657 + } 658 + 659 + static int update_inode_has_children(struct btree_trans *trans, 660 + struct bkey_s k, 661 + bool have_child) 662 + { 663 + if (!have_child) { 664 + int ret = bch2_inode_has_child_snapshots(trans, k.k->p); 665 + if (ret) 666 + return ret < 0 ? 
ret : 0; 667 + } 668 + 669 + u64 f = bkey_inode_flags(k.s_c); 670 + if (have_child != !!(f & BCH_INODE_has_child_snapshot)) 671 + bkey_inode_flags_set(k, f ^ BCH_INODE_has_child_snapshot); 672 + 673 + return 0; 674 + } 675 + 676 + static int update_parent_inode_has_children(struct btree_trans *trans, struct bpos pos, 677 + bool have_child) 678 + { 679 + struct btree_iter iter; 680 + struct bkey_s_c k = bch2_inode_get_iter_snapshot_parent(trans, 681 + &iter, pos, BTREE_ITER_with_updates); 682 + int ret = bkey_err(k); 683 + if (ret) 684 + return ret; 685 + if (!k.k) 686 + return 0; 687 + 688 + if (!have_child) { 689 + ret = bch2_inode_has_child_snapshots(trans, k.k->p); 690 + if (ret) { 691 + ret = ret < 0 ? ret : 0; 692 + goto err; 693 + } 694 + } 695 + 696 + u64 f = bkey_inode_flags(k); 697 + if (have_child != !!(f & BCH_INODE_has_child_snapshot)) { 698 + struct bkey_i *update = bch2_bkey_make_mut(trans, &iter, &k, 699 + BTREE_UPDATE_internal_snapshot_node); 700 + ret = PTR_ERR_OR_ZERO(update); 701 + if (ret) 702 + goto err; 703 + 704 + bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot); 705 + } 706 + err: 707 + bch2_trans_iter_exit(trans, &iter); 708 + return ret; 584 709 } 585 710 586 711 int bch2_trigger_inode(struct btree_trans *trans, ··· 717 586 struct bkey_s new, 718 587 enum btree_iter_update_trigger_flags flags) 719 588 { 589 + struct bch_fs *c = trans->c; 590 + 720 591 if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { 721 592 BUG_ON(!trans->journal_res.seq); 722 593 bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); ··· 732 599 return ret; 733 600 } 734 601 735 - int deleted_delta = (int) bkey_is_deleted_inode(new.s_c) - 736 - (int) bkey_is_deleted_inode(old); 737 - if ((flags & BTREE_TRIGGER_transactional) && deleted_delta) { 738 - int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, 739 - new.k->p, deleted_delta > 0); 740 - if (ret) 741 - return ret; 602 + if 
(flags & BTREE_TRIGGER_transactional) { 603 + int unlinked_delta = (int) bkey_is_unlinked_inode(new.s_c) - 604 + (int) bkey_is_unlinked_inode(old); 605 + if (unlinked_delta) { 606 + int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, 607 + new.k->p, unlinked_delta > 0); 608 + if (ret) 609 + return ret; 610 + } 611 + 612 + /* 613 + * If we're creating or deleting an inode at this snapshot ID, 614 + * and there might be an inode in a parent snapshot ID, we might 615 + * need to set or clear the has_child_snapshot flag on the 616 + * parent. 617 + */ 618 + int deleted_delta = (int) bkey_is_inode(new.k) - 619 + (int) bkey_is_inode(old.k); 620 + if (deleted_delta && 621 + bch2_snapshot_parent(c, new.k->p.snapshot)) { 622 + int ret = update_parent_inode_has_children(trans, new.k->p, 623 + deleted_delta > 0); 624 + if (ret) 625 + return ret; 626 + } 627 + 628 + /* 629 + * When an inode is first updated in a new snapshot, we may need 630 + * to clear has_child_snapshot 631 + */ 632 + if (deleted_delta > 0) { 633 + int ret = update_inode_has_children(trans, new, false); 634 + if (ret) 635 + return ret; 636 + } 742 637 } 743 638 744 639 return 0; ··· 1049 888 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 1050 889 goto retry; 1051 890 891 + if (ret) 892 + goto err2; 893 + 894 + ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot)); 895 + err2: 1052 896 bch2_trans_put(trans); 1053 897 return ret; 1054 898 } ··· 1158 992 return 0; 1159 993 } 1160 994 1161 - int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) 995 + static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) 1162 996 { 1163 997 struct bch_fs *c = trans->c; 1164 998 struct btree_iter iter = { NULL }; ··· 1221 1055 return ret ?: -BCH_ERR_transaction_restart_nested; 1222 1056 } 1223 1057 1058 + /* 1059 + * After deleting an inode, there may be versions in older snapshots that should 1060 + * also be deleted 
- if they're not referenced by sibling snapshots and not open 1061 + * in other subvolumes: 1062 + */ 1063 + static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpos pos) 1064 + { 1065 + struct btree_iter iter; 1066 + struct bkey_s_c k; 1067 + int ret; 1068 + next_parent: 1069 + ret = lockrestart_do(trans, 1070 + bkey_err(k = bch2_inode_get_iter_snapshot_parent(trans, &iter, pos, 0))); 1071 + if (ret || !k.k) 1072 + return ret; 1073 + 1074 + bool unlinked = bkey_is_unlinked_inode(k); 1075 + pos = k.k->p; 1076 + bch2_trans_iter_exit(trans, &iter); 1077 + 1078 + if (!unlinked) 1079 + return 0; 1080 + 1081 + ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos)); 1082 + if (ret) 1083 + return ret < 0 ? ret : 0; 1084 + 1085 + ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot); 1086 + if (ret) 1087 + return ret; 1088 + goto next_parent; 1089 + } 1090 + 1091 + int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) 1092 + { 1093 + return __bch2_inode_rm_snapshot(trans, inum, snapshot) ?: 1094 + delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot)); 1095 + } 1096 + 1224 1097 static int may_delete_deleted_inode(struct btree_trans *trans, 1225 1098 struct btree_iter *iter, 1226 1099 struct bpos pos, ··· 1269 1064 struct btree_iter inode_iter; 1270 1065 struct bkey_s_c k; 1271 1066 struct bch_inode_unpacked inode; 1067 + struct printbuf buf = PRINTBUF; 1272 1068 int ret; 1273 1069 1274 1070 k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached); ··· 1305 1099 pos.offset, pos.snapshot)) 1306 1100 goto delete; 1307 1101 1102 + if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot, 1103 + trans, deleted_inode_has_child_snapshots, 1104 + "inode with child snapshots %llu:%u in deleted_inodes btree", 1105 + pos.offset, pos.snapshot)) 1106 + goto delete; 1107 + 1108 + ret = bch2_inode_has_child_snapshots(trans, k.k->p); 1109 + if (ret < 0) 1110 + goto out; 
1111 + 1112 + if (ret) { 1113 + if (fsck_err(trans, inode_has_child_snapshots_wrong, 1114 + "inode has_child_snapshots flag wrong (should be set)\n%s", 1115 + (printbuf_reset(&buf), 1116 + bch2_inode_unpacked_to_text(&buf, &inode), 1117 + buf.buf))) { 1118 + inode.bi_flags |= BCH_INODE_has_child_snapshot; 1119 + ret = __bch2_fsck_write_inode(trans, &inode); 1120 + if (ret) 1121 + goto out; 1122 + } 1123 + goto delete; 1124 + 1125 + } 1126 + 1308 1127 if (test_bit(BCH_FS_clean_recovery, &c->flags) && 1309 1128 !fsck_err(trans, deleted_inode_but_clean, 1310 1129 "filesystem marked as clean but have deleted inode %llu:%u", ··· 1338 1107 goto out; 1339 1108 } 1340 1109 1341 - if (bch2_snapshot_is_internal_node(c, pos.snapshot)) { 1342 - struct bpos new_min_pos; 1343 - 1344 - ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos); 1345 - if (ret) 1346 - goto out; 1347 - 1348 - inode.bi_flags &= ~BCH_INODE_unlinked; 1349 - 1350 - ret = bch2_inode_write_flags(trans, &inode_iter, &inode, 1351 - BTREE_UPDATE_internal_snapshot_node); 1352 - bch_err_msg(c, ret, "clearing inode unlinked flag"); 1353 - if (ret) 1354 - goto out; 1355 - 1356 - /* 1357 - * We'll need another write buffer flush to pick up the new 1358 - * unlinked inodes in the snapshot leaves: 1359 - */ 1360 - *need_another_pass = true; 1361 - goto out; 1362 - } 1363 - 1364 1110 ret = 1; 1365 1111 out: 1366 1112 fsck_err: 1367 1113 bch2_trans_iter_exit(trans, &inode_iter); 1114 + printbuf_exit(&buf); 1368 1115 return ret; 1369 1116 delete: 1370 1117 ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false);
+10
fs/bcachefs/inode.h
··· 5 5 #include "bkey.h" 6 6 #include "bkey_methods.h" 7 7 #include "opts.h" 8 + #include "snapshot.h" 8 9 9 10 enum bch_validate_flags; 10 11 extern const char * const bch2_inode_opts[]; ··· 17 16 int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c, 18 17 enum bch_validate_flags); 19 18 void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); 19 + 20 + int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos); 21 + 22 + static inline int bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) 23 + { 24 + return bch2_snapshot_is_leaf(trans->c, pos.snapshot) <= 0 25 + ? __bch2_inode_has_child_snapshots(trans, pos) 26 + : 0; 27 + } 20 28 21 29 int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned, 22 30 struct bkey_s_c, struct bkey_s,
+2 -1
fs/bcachefs/inode_format.h
··· 133 133 x(i_size_dirty, 5) \ 134 134 x(i_sectors_dirty, 6) \ 135 135 x(unlinked, 7) \ 136 - x(backptr_untrusted, 8) 136 + x(backptr_untrusted, 8) \ 137 + x(has_child_snapshot, 9) 137 138 138 139 /* bits 20+ reserved for packed fields below: */ 139 140
+13
fs/bcachefs/journal.c
··· 603 603 { 604 604 int ret; 605 605 606 + if (closure_wait_event_timeout(&j->async_wait, 607 + (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || 608 + (flags & JOURNAL_RES_GET_NONBLOCK), 609 + HZ * 10)) 610 + return ret; 611 + 612 + struct bch_fs *c = container_of(j, struct bch_fs, journal); 613 + struct printbuf buf = PRINTBUF; 614 + bch2_journal_debug_to_text(&buf, j); 615 + bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s", 616 + buf.buf); 617 + printbuf_exit(&buf); 618 + 606 619 closure_wait_event(&j->async_wait, 607 620 (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || 608 621 (flags & JOURNAL_RES_GET_NONBLOCK));
+3 -1
fs/bcachefs/opts.c
··· 427 427 prt_printf(out, "%lli", v); 428 428 break; 429 429 case BCH_OPT_STR: 430 - if (flags & OPT_SHOW_FULL_LIST) 430 + if (v < opt->min || v >= opt->max - 1) 431 + prt_printf(out, "(invalid option %lli)", v); 432 + else if (flags & OPT_SHOW_FULL_LIST) 431 433 prt_string_option(out, opt->choices, v); 432 434 else 433 435 prt_str(out, opt->choices[v]);
+2 -1
fs/bcachefs/recovery.c
··· 287 287 BCH_TRANS_COMMIT_no_enospc| 288 288 BCH_TRANS_COMMIT_journal_reclaim| 289 289 BCH_TRANS_COMMIT_skip_accounting_apply| 290 - BCH_TRANS_COMMIT_no_journal_res, 290 + BCH_TRANS_COMMIT_no_journal_res| 291 + BCH_WATERMARK_reclaim, 291 292 bch2_journal_replay_accounting_key(trans, k)); 292 293 if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret))) 293 294 goto err;
+1
fs/bcachefs/recovery_passes_types.h
··· 46 46 x(check_dirents, 27, PASS_FSCK) \ 47 47 x(check_xattrs, 28, PASS_FSCK) \ 48 48 x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ 49 + x(check_unreachable_inodes, 40, PASS_ONLINE|PASS_FSCK) \ 49 50 x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ 50 51 x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ 51 52 x(check_nlinks, 31, PASS_FSCK) \
+31 -8
fs/bcachefs/replicas.c
··· 66 66 prt_printf(out, "]"); 67 67 } 68 68 69 - static int bch2_replicas_entry_validate_locked(struct bch_replicas_entry_v1 *r, 70 - struct bch_sb *sb, 71 - struct printbuf *err) 69 + static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r, 70 + struct bch_sb *sb, 71 + struct printbuf *err) 72 72 { 73 73 if (!r->nr_devs) { 74 74 prt_printf(err, "no devices in entry "); ··· 98 98 struct bch_fs *c, 99 99 struct printbuf *err) 100 100 { 101 - mutex_lock(&c->sb_lock); 102 - int ret = bch2_replicas_entry_validate_locked(r, c->disk_sb.sb, err); 103 - mutex_unlock(&c->sb_lock); 104 - return ret; 101 + if (!r->nr_devs) { 102 + prt_printf(err, "no devices in entry "); 103 + goto bad; 104 + } 105 + 106 + if (r->nr_required > 1 && 107 + r->nr_required >= r->nr_devs) { 108 + prt_printf(err, "bad nr_required in entry "); 109 + goto bad; 110 + } 111 + 112 + for (unsigned i = 0; i < r->nr_devs; i++) 113 + if (r->devs[i] != BCH_SB_MEMBER_INVALID && 114 + !bch2_dev_exists(c, r->devs[i])) { 115 + prt_printf(err, "invalid device %u in entry ", r->devs[i]); 116 + goto bad; 117 + } 118 + 119 + return 0; 120 + bad: 121 + bch2_replicas_entry_to_text(err, r); 122 + return -BCH_ERR_invalid_replicas_entry; 105 123 } 106 124 107 125 void bch2_cpu_replicas_to_text(struct printbuf *out, ··· 704 686 struct bch_replicas_entry_v1 *e = 705 687 cpu_replicas_entry(cpu_r, i); 706 688 707 - int ret = bch2_replicas_entry_validate_locked(e, sb, err); 689 + int ret = bch2_replicas_entry_sb_validate(e, sb, err); 708 690 if (ret) 709 691 return ret; 710 692 ··· 821 803 822 804 rcu_read_lock(); 823 805 for (unsigned i = 0; i < e->nr_devs; i++) { 806 + if (e->devs[i] == BCH_SB_MEMBER_INVALID) { 807 + nr_failed++; 808 + continue; 809 + } 810 + 824 811 nr_online += test_bit(e->devs[i], devs.d); 825 812 826 813 struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]);
+4 -1
fs/bcachefs/sb-downgrade.c
··· 78 78 BCH_FSCK_ERR_accounting_mismatch) \ 79 79 x(rebalance_work_acct_fix, \ 80 80 BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ 81 - BCH_FSCK_ERR_accounting_mismatch) 81 + BCH_FSCK_ERR_accounting_mismatch) \ 82 + x(inode_has_child_snapshots, \ 83 + BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ 84 + BCH_FSCK_ERR_inode_has_child_snapshots_wrong) 82 85 83 86 #define DOWNGRADE_TABLE() \ 84 87 x(bucket_stripe_sectors, \
+5 -1
fs/bcachefs/sb-errors_format.h
··· 180 180 x(reflink_p_to_missing_reflink_v, 166, 0) \ 181 181 x(stripe_pos_bad, 167, 0) \ 182 182 x(stripe_val_size_bad, 168, 0) \ 183 + x(stripe_csum_granularity_bad, 290, 0) \ 183 184 x(stripe_sector_count_wrong, 169, 0) \ 184 185 x(snapshot_tree_pos_bad, 170, 0) \ 185 186 x(snapshot_tree_to_missing_snapshot, 171, 0) \ ··· 226 225 x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \ 227 226 x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \ 228 227 x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \ 228 + x(inode_has_child_snapshots_wrong, 287, 0) \ 229 229 x(inode_unreachable, 210, FSCK_AUTOFIX) \ 230 230 x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ 231 231 x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ 232 232 x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ 233 233 x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \ 234 + x(deleted_inode_has_child_snapshots, 288, FSCK_AUTOFIX) \ 234 235 x(extent_overlapping, 215, 0) \ 235 236 x(key_in_missing_inode, 216, 0) \ 236 237 x(key_in_wrong_inode_type, 217, 0) \ ··· 292 289 x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \ 293 290 x(accounting_mismatch, 272, FSCK_AUTOFIX) \ 294 291 x(accounting_replicas_not_marked, 273, 0) \ 292 + x(accounting_to_invalid_device, 289, 0) \ 295 293 x(invalid_btree_id, 274, 0) \ 296 294 x(alloc_key_io_time_bad, 275, 0) \ 297 295 x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \ ··· 302 298 x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ 303 299 x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ 304 300 x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ 305 - x(MAX, 287, 0) 301 + x(MAX, 291, 0) 306 302 307 303 enum bch_sb_error_id { 308 304 #define x(t, n, ...) BCH_FSCK_ERR_##t = n,
+9 -1
fs/bcachefs/sb-members.c
··· 163 163 return -BCH_ERR_invalid_sb_members; 164 164 } 165 165 166 + if (m.btree_bitmap_shift >= 64) { 167 + prt_printf(err, "device %u: invalid btree_bitmap_shift %u", i, m.btree_bitmap_shift); 168 + return -BCH_ERR_invalid_sb_members; 169 + } 170 + 166 171 return 0; 167 172 } 168 173 ··· 252 247 prt_newline(out); 253 248 254 249 prt_printf(out, "Btree allocated bitmap blocksize:\t"); 255 - prt_units_u64(out, 1ULL << m.btree_bitmap_shift); 250 + if (m.btree_bitmap_shift < 64) 251 + prt_units_u64(out, 1ULL << m.btree_bitmap_shift); 252 + else 253 + prt_printf(out, "(invalid shift %u)", m.btree_bitmap_shift); 256 254 prt_newline(out); 257 255 258 256 prt_printf(out, "Btree allocated bitmap:\t");
+30 -99
fs/bcachefs/snapshot.c
··· 905 905 if (bch2_snapshot_equiv(c, id)) 906 906 return 0; 907 907 908 - /* 0 is an invalid tree ID */ 908 + /* Do we need to reconstruct the snapshot_tree entry as well? */ 909 + struct btree_iter iter; 910 + struct bkey_s_c k; 911 + int ret = 0; 909 912 u32 tree_id = 0; 910 - int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id); 913 + 914 + for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, 915 + 0, k, ret) { 916 + if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) { 917 + tree_id = k.k->p.offset; 918 + break; 919 + } 920 + } 921 + bch2_trans_iter_exit(trans, &iter); 922 + 911 923 if (ret) 912 924 return ret; 925 + 926 + if (!tree_id) { 927 + ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id); 928 + if (ret) 929 + return ret; 930 + } 913 931 914 932 struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot)); 915 933 ret = PTR_ERR_OR_ZERO(snapshot); ··· 938 920 snapshot->k.p = POS(0, id); 939 921 snapshot->v.tree = cpu_to_le32(tree_id); 940 922 snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c)); 923 + 924 + for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 925 + 0, k, ret) { 926 + if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) { 927 + snapshot->v.subvol = cpu_to_le32(k.k->p.offset); 928 + SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true); 929 + break; 930 + } 931 + } 932 + bch2_trans_iter_exit(trans, &iter); 941 933 942 934 return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: 943 935 bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, ··· 1758 1730 bch2_trans_iter_exit(trans, &iter); 1759 1731 1760 1732 return ret; 1761 - } 1762 - 1763 - static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id) 1764 - { 1765 - const struct snapshot_t *s = snapshot_t(c, id); 1766 - 1767 - return s->children[1] ?: s->children[0]; 1768 - } 1769 - 1770 - static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id) 1771 - { 1772 - 
u32 child; 1773 - 1774 - while ((child = bch2_snapshot_smallest_child(c, id))) 1775 - id = child; 1776 - return id; 1777 - } 1778 - 1779 - static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans, 1780 - enum btree_id btree, 1781 - struct bkey_s_c interior_k, 1782 - u32 leaf_id, struct bpos *new_min_pos) 1783 - { 1784 - struct btree_iter iter; 1785 - struct bpos pos = interior_k.k->p; 1786 - struct bkey_s_c k; 1787 - struct bkey_i *new; 1788 - int ret; 1789 - 1790 - pos.snapshot = leaf_id; 1791 - 1792 - bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); 1793 - k = bch2_btree_iter_peek_slot(&iter); 1794 - ret = bkey_err(k); 1795 - if (ret) 1796 - goto out; 1797 - 1798 - /* key already overwritten in this snapshot? */ 1799 - if (k.k->p.snapshot != interior_k.k->p.snapshot) 1800 - goto out; 1801 - 1802 - if (bpos_eq(*new_min_pos, POS_MIN)) { 1803 - *new_min_pos = k.k->p; 1804 - new_min_pos->snapshot = leaf_id; 1805 - } 1806 - 1807 - new = bch2_bkey_make_mut_noupdate(trans, interior_k); 1808 - ret = PTR_ERR_OR_ZERO(new); 1809 - if (ret) 1810 - goto out; 1811 - 1812 - new->k.p.snapshot = leaf_id; 1813 - ret = bch2_trans_update(trans, &iter, new, 0); 1814 - out: 1815 - bch2_set_btree_iter_dontneed(&iter); 1816 - bch2_trans_iter_exit(trans, &iter); 1817 - return ret; 1818 - } 1819 - 1820 - int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans, 1821 - enum btree_id btree, 1822 - struct bkey_s_c k, 1823 - struct bpos *new_min_pos) 1824 - { 1825 - struct bch_fs *c = trans->c; 1826 - struct bkey_buf sk; 1827 - u32 restart_count = trans->restart_count; 1828 - int ret = 0; 1829 - 1830 - bch2_bkey_buf_init(&sk); 1831 - bch2_bkey_buf_reassemble(&sk, c, k); 1832 - k = bkey_i_to_s_c(sk.k); 1833 - 1834 - *new_min_pos = POS_MIN; 1835 - 1836 - for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot); 1837 - id < k.k->p.snapshot; 1838 - id++) { 1839 - if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) || 1840 - 
!bch2_snapshot_is_leaf(c, id)) 1841 - continue; 1842 - again: 1843 - ret = btree_trans_too_many_iters(trans) ?: 1844 - bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?: 1845 - bch2_trans_commit(trans, NULL, NULL, 0); 1846 - if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) { 1847 - bch2_trans_begin(trans); 1848 - goto again; 1849 - } 1850 - 1851 - if (ret) 1852 - break; 1853 - } 1854 - 1855 - bch2_bkey_buf_exit(&sk, c); 1856 - 1857 - return ret ?: trans_was_restarted(trans, restart_count); 1858 1733 } 1859 1734 1860 1735 static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
-3
fs/bcachefs/snapshot.h
··· 259 259 return __bch2_key_has_snapshot_overwrites(trans, id, pos); 260 260 } 261 261 262 - int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *, enum btree_id, 263 - struct bkey_s_c, struct bpos *); 264 - 265 262 int bch2_snapshots_read(struct bch_fs *); 266 263 void bch2_fs_snapshots_exit(struct bch_fs *); 267 264
+24 -10
fs/bcachefs/super.c
··· 184 184 185 185 DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait); 186 186 187 + static void bch2_dev_unlink(struct bch_dev *); 187 188 static void bch2_dev_free(struct bch_dev *); 188 189 static int bch2_dev_alloc(struct bch_fs *, unsigned); 189 190 static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); ··· 621 620 up_write(&c->state_lock); 622 621 623 622 for_each_member_device(c, ca) 624 - if (ca->kobj.state_in_sysfs && 625 - ca->disk_sb.bdev) 626 - sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); 623 + bch2_dev_unlink(ca); 627 624 628 625 if (c->kobj.state_in_sysfs) 629 626 kobject_del(&c->kobj); ··· 1186 1187 { 1187 1188 cancel_work_sync(&ca->io_error_work); 1188 1189 1189 - if (ca->kobj.state_in_sysfs && 1190 - ca->disk_sb.bdev) 1191 - sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); 1190 + bch2_dev_unlink(ca); 1192 1191 1193 1192 if (ca->kobj.state_in_sysfs) 1194 1193 kobject_del(&ca->kobj); ··· 1223 1226 percpu_ref_kill(&ca->io_ref); 1224 1227 wait_for_completion(&ca->io_ref_completion); 1225 1228 1226 - if (ca->kobj.state_in_sysfs) { 1227 - sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); 1228 - sysfs_remove_link(&ca->kobj, "block"); 1229 - } 1229 + bch2_dev_unlink(ca); 1230 1230 1231 1231 bch2_free_super(&ca->disk_sb); 1232 1232 bch2_dev_journal_exit(ca); ··· 1243 1249 struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref); 1244 1250 1245 1251 complete(&ca->io_ref_completion); 1252 + } 1253 + 1254 + static void bch2_dev_unlink(struct bch_dev *ca) 1255 + { 1256 + struct kobject *b; 1257 + 1258 + /* 1259 + * This is racy w.r.t. the underlying block device being hot-removed, 1260 + * which removes it from sysfs. 
1261 + * 1262 + * It'd be lovely if we had a way to handle this race, but the sysfs 1263 + * code doesn't appear to provide a good method and block/holder.c is 1264 + * susceptible as well: 1265 + */ 1266 + if (ca->kobj.state_in_sysfs && 1267 + ca->disk_sb.bdev && 1268 + (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) { 1269 + sysfs_remove_link(b, "bcachefs"); 1270 + sysfs_remove_link(&ca->kobj, "block"); 1271 + } 1246 1272 } 1247 1273 1248 1274 static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
+35
include/linux/closure.h
··· 454 454 __closure_wait_event(waitlist, _cond); \ 455 455 } while (0) 456 456 457 + #define __closure_wait_event_timeout(waitlist, _cond, _until) \ 458 + ({ \ 459 + struct closure cl; \ 460 + long _t; \ 461 + \ 462 + closure_init_stack(&cl); \ 463 + \ 464 + while (1) { \ 465 + closure_wait(waitlist, &cl); \ 466 + if (_cond) { \ 467 + _t = max_t(long, 1L, _until - jiffies); \ 468 + break; \ 469 + } \ 470 + _t = max_t(long, 0L, _until - jiffies); \ 471 + if (!_t) \ 472 + break; \ 473 + closure_sync_timeout(&cl, _t); \ 474 + } \ 475 + closure_wake_up(waitlist); \ 476 + closure_sync(&cl); \ 477 + _t; \ 478 + }) 479 + 480 + /* 481 + * Returns 0 if timeout expired, remaining time in jiffies (at least 1) if 482 + * condition became true 483 + */ 484 + #define closure_wait_event_timeout(waitlist, _cond, _timeout) \ 485 + ({ \ 486 + unsigned long _until = jiffies + _timeout; \ 487 + (_cond) \ 488 + ? max_t(long, 1L, _until - jiffies) \ 489 + : __closure_wait_event_timeout(waitlist, _cond, _until);\ 490 + }) 491 + 457 492 #endif /* _LINUX_CLOSURE_H */