Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v6.15-rc5 2283 lines 56 kB view raw
1// SPDX-License-Identifier: GPL-2.0 2/* 3 * bcachefs setup/teardown code, and some metadata io - read a superblock and 4 * figure out what to do with it. 5 * 6 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> 7 * Copyright 2012 Google, Inc. 8 */ 9 10#include "bcachefs.h" 11#include "alloc_background.h" 12#include "alloc_foreground.h" 13#include "bkey_sort.h" 14#include "btree_cache.h" 15#include "btree_gc.h" 16#include "btree_journal_iter.h" 17#include "btree_key_cache.h" 18#include "btree_node_scan.h" 19#include "btree_update_interior.h" 20#include "btree_io.h" 21#include "btree_write_buffer.h" 22#include "buckets_waiting_for_journal.h" 23#include "chardev.h" 24#include "checksum.h" 25#include "clock.h" 26#include "compress.h" 27#include "debug.h" 28#include "disk_accounting.h" 29#include "disk_groups.h" 30#include "ec.h" 31#include "errcode.h" 32#include "error.h" 33#include "fs.h" 34#include "fs-io.h" 35#include "fs-io-buffered.h" 36#include "fs-io-direct.h" 37#include "fsck.h" 38#include "inode.h" 39#include "io_read.h" 40#include "io_write.h" 41#include "journal.h" 42#include "journal_reclaim.h" 43#include "journal_seq_blacklist.h" 44#include "move.h" 45#include "migrate.h" 46#include "movinggc.h" 47#include "nocow_locking.h" 48#include "quota.h" 49#include "rebalance.h" 50#include "recovery.h" 51#include "replicas.h" 52#include "sb-clean.h" 53#include "sb-counters.h" 54#include "sb-errors.h" 55#include "sb-members.h" 56#include "snapshot.h" 57#include "subvolume.h" 58#include "super.h" 59#include "super-io.h" 60#include "sysfs.h" 61#include "thread_with_file.h" 62#include "trace.h" 63 64#include <linux/backing-dev.h> 65#include <linux/blkdev.h> 66#include <linux/debugfs.h> 67#include <linux/device.h> 68#include <linux/idr.h> 69#include <linux/module.h> 70#include <linux/percpu.h> 71#include <linux/random.h> 72#include <linux/sysfs.h> 73 74MODULE_LICENSE("GPL"); 75MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>"); 76MODULE_DESCRIPTION("bcachefs filesystem"); 77 78const char * const bch2_fs_flag_strs[] = { 79#define x(n) #n, 80 BCH_FS_FLAGS() 81#undef x 82 NULL 83}; 84 85void bch2_print_str(struct bch_fs *c, const char *str) 86{ 87#ifdef __KERNEL__ 88 struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); 89 90 if (unlikely(stdio)) { 91 bch2_stdio_redirect_printf(stdio, true, "%s", str); 92 return; 93 } 94#endif 95 bch2_print_string_as_lines(KERN_ERR, str); 96} 97 98__printf(2, 0) 99static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args) 100{ 101#ifdef __KERNEL__ 102 if (unlikely(stdio)) { 103 if (fmt[0] == KERN_SOH[0]) 104 fmt += 2; 105 106 bch2_stdio_redirect_vprintf(stdio, true, fmt, args); 107 return; 108 } 109#endif 110 vprintk(fmt, args); 111} 112 113void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...) 114{ 115 struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio; 116 117 va_list args; 118 va_start(args, fmt); 119 bch2_print_maybe_redirect(stdio, fmt, args); 120 va_end(args); 121} 122 123void __bch2_print(struct bch_fs *c, const char *fmt, ...) 124{ 125 struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); 126 127 va_list args; 128 va_start(args, fmt); 129 bch2_print_maybe_redirect(stdio, fmt, args); 130 va_end(args); 131} 132 133#define KTYPE(type) \ 134static const struct attribute_group type ## _group = { \ 135 .attrs = type ## _files \ 136}; \ 137 \ 138static const struct attribute_group *type ## _groups[] = { \ 139 &type ## _group, \ 140 NULL \ 141}; \ 142 \ 143static const struct kobj_type type ## _ktype = { \ 144 .release = type ## _release, \ 145 .sysfs_ops = &type ## _sysfs_ops, \ 146 .default_groups = type ## _groups \ 147} 148 149static void bch2_fs_release(struct kobject *); 150static void bch2_dev_release(struct kobject *); 151static void bch2_fs_counters_release(struct kobject *k) 152{ 153} 154 155static void bch2_fs_internal_release(struct kobject *k) 156{ 157} 158 159static void bch2_fs_opts_dir_release(struct kobject *k) 160{ 161} 162 163static void bch2_fs_time_stats_release(struct kobject *k) 164{ 165} 166 167KTYPE(bch2_fs); 168KTYPE(bch2_fs_counters); 169KTYPE(bch2_fs_internal); 170KTYPE(bch2_fs_opts_dir); 171KTYPE(bch2_fs_time_stats); 172KTYPE(bch2_dev); 173 174static struct kset *bcachefs_kset; 175static LIST_HEAD(bch_fs_list); 176static DEFINE_MUTEX(bch_fs_list_lock); 177 178DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait); 179 180static void bch2_dev_unlink(struct bch_dev *); 181static void bch2_dev_free(struct bch_dev *); 182static int bch2_dev_alloc(struct bch_fs *, unsigned); 183static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); 184static void bch2_dev_io_ref_stop(struct bch_dev *, int); 185static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); 186 187struct bch_fs *bch2_dev_to_fs(dev_t dev) 188{ 189 struct bch_fs *c; 190 191 mutex_lock(&bch_fs_list_lock); 192 rcu_read_lock(); 193 194 list_for_each_entry(c, &bch_fs_list, list) 195 for_each_member_device_rcu(c, ca, NULL) 196 if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) { 197 closure_get(&c->cl); 198 goto found; 199 } 200 c = NULL; 201found: 202 rcu_read_unlock(); 203 mutex_unlock(&bch_fs_list_lock); 204 205 return c; 206} 207 208static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid) 209{ 210 struct bch_fs *c; 211 212 lockdep_assert_held(&bch_fs_list_lock); 213 214 list_for_each_entry(c, &bch_fs_list, list) 215 if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid))) 216 return c; 217 218 return NULL; 219} 220 221struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid) 222{ 223 struct bch_fs *c; 224 225 mutex_lock(&bch_fs_list_lock); 226 c = __bch2_uuid_to_fs(uuid); 227 if (c) 228 closure_get(&c->cl); 229 mutex_unlock(&bch_fs_list_lock); 230 231 return c; 232} 233 234/* Filesystem RO/RW: */ 235 236/* 237 * For startup/shutdown of RW stuff, the dependencies are: 238 * 239 * - foreground writes depend on copygc and rebalance (to free up space) 240 * 241 * - copygc and rebalance depend on mark and sweep gc (they actually probably 242 * don't because they either reserve ahead of time or don't block if 243 * allocations fail, but allocations can require mark and sweep gc to run 244 * because of generation number wraparound) 245 * 246 * - all of the above depends on the allocator threads 247 * 248 * - allocator depends on the journal (when it rewrites prios and gens) 249 */ 250 251static void __bch2_fs_read_only(struct bch_fs *c) 252{ 253 unsigned clean_passes = 0; 254 u64 seq = 0; 255 256 bch2_fs_ec_stop(c); 257 bch2_open_buckets_stop(c, NULL, true); 258 bch2_rebalance_stop(c); 259 bch2_copygc_stop(c); 260 bch2_fs_ec_flush(c); 261 262 bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu", 263 journal_cur_seq(&c->journal)); 264 265 do { 266 clean_passes++; 267 268 if (bch2_btree_interior_updates_flush(c) || 269 bch2_btree_write_buffer_flush_going_ro(c) || 270 bch2_journal_flush_all_pins(&c->journal) || 271 bch2_btree_flush_all_writes(c) || 272 seq != atomic64_read(&c->journal.seq)) { 273 seq = atomic64_read(&c->journal.seq); 274 clean_passes = 0; 275 } 276 } while (clean_passes < 2); 277 278 bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu", 279 journal_cur_seq(&c->journal)); 280 281 if (test_bit(JOURNAL_replay_done, &c->journal.flags) && 282 !test_bit(BCH_FS_emergency_ro, &c->flags)) 283 set_bit(BCH_FS_clean_shutdown, &c->flags); 284 285 bch2_fs_journal_stop(&c->journal); 286 287 bch_info(c, "%sclean shutdown complete, journal seq %llu", 288 test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un", 289 c->journal.seq_ondisk); 290 291 /* 292 * After stopping journal: 293 */ 294 for_each_member_device(c, ca) { 295 bch2_dev_io_ref_stop(ca, WRITE); 296 bch2_dev_allocator_remove(c, ca); 297 } 298} 299 300#ifndef BCH_WRITE_REF_DEBUG 301static void bch2_writes_disabled(struct percpu_ref *writes) 302{ 303 struct bch_fs *c = container_of(writes, struct bch_fs, writes); 304 305 set_bit(BCH_FS_write_disable_complete, &c->flags); 306 wake_up(&bch2_read_only_wait); 307} 308#endif 309 310void bch2_fs_read_only(struct bch_fs *c) 311{ 312 if (!test_bit(BCH_FS_rw, &c->flags)) { 313 bch2_journal_reclaim_stop(&c->journal); 314 return; 315 } 316 317 BUG_ON(test_bit(BCH_FS_write_disable_complete, &c->flags)); 318 319 bch_verbose(c, "going read-only"); 320 321 /* 322 * Block new foreground-end write operations from starting - any new 323 * writes will return -EROFS: 324 */ 325 set_bit(BCH_FS_going_ro, &c->flags); 326#ifndef BCH_WRITE_REF_DEBUG 327 percpu_ref_kill(&c->writes); 328#else 329 for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) 330 bch2_write_ref_put(c, i); 331#endif 332 333 /* 334 * If we're not doing an emergency shutdown, we want to wait on 335 * outstanding writes to complete so they don't see spurious errors due 336 * to shutting down the allocator: 337 * 338 * If we are doing an emergency shutdown outstanding writes may 339 * hang until we shutdown the allocator so we don't want to wait 340 * on outstanding writes before shutting everything down - but 341 * we do need to wait on them before returning and signalling 342 * that going RO is complete: 343 */ 344 wait_event(bch2_read_only_wait, 345 test_bit(BCH_FS_write_disable_complete, &c->flags) || 346 test_bit(BCH_FS_emergency_ro, &c->flags)); 347 348 bool writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags); 349 if (writes_disabled) 350 bch_verbose(c, "finished waiting for writes to stop"); 351 352 __bch2_fs_read_only(c); 353 354 wait_event(bch2_read_only_wait, 355 test_bit(BCH_FS_write_disable_complete, &c->flags)); 356 357 if (!writes_disabled) 358 bch_verbose(c, "finished waiting for writes to stop"); 359 360 clear_bit(BCH_FS_write_disable_complete, &c->flags); 361 clear_bit(BCH_FS_going_ro, &c->flags); 362 clear_bit(BCH_FS_rw, &c->flags); 363 364 if (!bch2_journal_error(&c->journal) && 365 !test_bit(BCH_FS_error, &c->flags) && 366 !test_bit(BCH_FS_emergency_ro, &c->flags) && 367 test_bit(BCH_FS_started, &c->flags) && 368 test_bit(BCH_FS_clean_shutdown, &c->flags) && 369 c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) { 370 BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); 371 BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty)); 372 BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); 373 BUG_ON(c->btree_write_buffer.inc.keys.nr); 374 BUG_ON(c->btree_write_buffer.flushing.keys.nr); 375 bch2_verify_accounting_clean(c); 376 377 bch_verbose(c, "marking filesystem clean"); 378 bch2_fs_mark_clean(c); 379 } else { 380 bch_verbose(c, "done going read-only, filesystem not clean"); 381 } 382} 383 384static void bch2_fs_read_only_work(struct work_struct *work) 385{ 386 struct bch_fs *c = 387 container_of(work, struct bch_fs, read_only_work); 388 389 down_write(&c->state_lock); 390 bch2_fs_read_only(c); 391 up_write(&c->state_lock); 392} 393 394static void bch2_fs_read_only_async(struct bch_fs *c) 395{ 396 queue_work(system_long_wq, &c->read_only_work); 397} 398 399bool bch2_fs_emergency_read_only(struct bch_fs *c) 400{ 401 bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); 402 403 bch2_journal_halt(&c->journal); 404 bch2_fs_read_only_async(c); 405 406 wake_up(&bch2_read_only_wait); 407 return ret; 408} 409 410bool bch2_fs_emergency_read_only_locked(struct bch_fs *c) 411{ 412 bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); 413 414 bch2_journal_halt_locked(&c->journal); 415 bch2_fs_read_only_async(c); 416 417 wake_up(&bch2_read_only_wait); 418 return ret; 419} 420 421static int __bch2_fs_read_write(struct bch_fs *c, bool early) 422{ 423 int ret; 424 425 BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); 426 427 if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) { 428 bch_err(c, "cannot go rw, unfixed btree errors"); 429 return -BCH_ERR_erofs_unfixed_errors; 430 } 431 432 if (test_bit(BCH_FS_rw, &c->flags)) 433 return 0; 434 435 bch_info(c, "going read-write"); 436 437 ret = bch2_sb_members_v2_init(c); 438 if (ret) 439 goto err; 440 441 clear_bit(BCH_FS_clean_shutdown, &c->flags); 442 443 __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) { 444 bch2_dev_allocator_add(c, ca); 445 percpu_ref_reinit(&ca->io_ref[WRITE]); 446 } 447 bch2_recalc_capacity(c); 448 449 /* 450 * First journal write must be a flush write: after a clean shutdown we 451 * don't read the journal, so the first journal write may end up 452 * overwriting whatever was there previously, and there must always be 453 * at least one non-flush write in the journal or recovery will fail: 454 */ 455 spin_lock(&c->journal.lock); 456 set_bit(JOURNAL_need_flush_write, &c->journal.flags); 457 set_bit(JOURNAL_running, &c->journal.flags); 458 bch2_journal_space_available(&c->journal); 459 spin_unlock(&c->journal.lock); 460 461 ret = bch2_fs_mark_dirty(c); 462 if (ret) 463 goto err; 464 465 ret = bch2_journal_reclaim_start(&c->journal); 466 if (ret) 467 goto err; 468 469 set_bit(BCH_FS_rw, &c->flags); 470 set_bit(BCH_FS_was_rw, &c->flags); 471 472#ifndef BCH_WRITE_REF_DEBUG 473 percpu_ref_reinit(&c->writes); 474#else 475 for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) { 476 BUG_ON(atomic_long_read(&c->writes[i])); 477 atomic_long_inc(&c->writes[i]); 478 } 479#endif 480 481 ret = bch2_copygc_start(c); 482 if (ret) { 483 bch_err_msg(c, ret, "error starting copygc thread"); 484 goto err; 485 } 486 487 ret = bch2_rebalance_start(c); 488 if (ret) { 489 bch_err_msg(c, ret, "error starting rebalance thread"); 490 goto err; 491 } 492 493 bch2_do_discards(c); 494 bch2_do_invalidates(c); 495 bch2_do_stripe_deletes(c); 496 bch2_do_pending_node_rewrites(c); 497 return 0; 498err: 499 if (test_bit(BCH_FS_rw, &c->flags)) 500 bch2_fs_read_only(c); 501 else 502 __bch2_fs_read_only(c); 503 return ret; 504} 505 506int bch2_fs_read_write(struct bch_fs *c) 507{ 508 if (c->opts.recovery_pass_last && 509 c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay) 510 return -BCH_ERR_erofs_norecovery; 511 512 if (c->opts.nochanges) 513 return -BCH_ERR_erofs_nochanges; 514 515 return __bch2_fs_read_write(c, false); 516} 517 518int bch2_fs_read_write_early(struct bch_fs *c) 519{ 520 down_write(&c->state_lock); 521 int ret = __bch2_fs_read_write(c, true); 522 up_write(&c->state_lock); 523 524 return ret; 525} 526 527/* Filesystem startup/shutdown: */ 528 529static void __bch2_fs_free(struct bch_fs *c) 530{ 531 for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++) 532 bch2_time_stats_exit(&c->times[i]); 533 534#ifdef CONFIG_UNICODE 535 utf8_unload(c->cf_encoding); 536#endif 537 538 bch2_find_btree_nodes_exit(&c->found_btree_nodes); 539 bch2_free_pending_node_rewrites(c); 540 bch2_free_fsck_errs(c); 541 bch2_fs_accounting_exit(c); 542 bch2_fs_sb_errors_exit(c); 543 bch2_fs_counters_exit(c); 544 bch2_fs_snapshots_exit(c); 545 bch2_fs_quota_exit(c); 546 bch2_fs_fs_io_direct_exit(c); 547 bch2_fs_fs_io_buffered_exit(c); 548 bch2_fs_fsio_exit(c); 549 bch2_fs_vfs_exit(c); 550 bch2_fs_ec_exit(c); 551 bch2_fs_encryption_exit(c); 552 bch2_fs_nocow_locking_exit(c); 553 bch2_fs_io_write_exit(c); 554 bch2_fs_io_read_exit(c); 555 bch2_fs_buckets_waiting_for_journal_exit(c); 556 bch2_fs_btree_interior_update_exit(c); 557 bch2_fs_btree_key_cache_exit(&c->btree_key_cache); 558 bch2_fs_btree_cache_exit(c); 559 bch2_fs_btree_iter_exit(c); 560 bch2_fs_replicas_exit(c); 561 bch2_fs_journal_exit(&c->journal); 562 bch2_io_clock_exit(&c->io_clock[WRITE]); 563 bch2_io_clock_exit(&c->io_clock[READ]); 564 bch2_fs_compress_exit(c); 565 bch2_fs_btree_gc_exit(c); 566 bch2_journal_keys_put_initial(c); 567 bch2_find_btree_nodes_exit(&c->found_btree_nodes); 568 BUG_ON(atomic_read(&c->journal_keys.ref)); 569 bch2_fs_btree_write_buffer_exit(c); 570 percpu_free_rwsem(&c->mark_lock); 571 if (c->online_reserved) { 572 u64 v = percpu_u64_get(c->online_reserved); 573 WARN(v, "online_reserved not 0 at shutdown: %lli", v); 574 free_percpu(c->online_reserved); 575 } 576 577 darray_exit(&c->incompat_versions_requested); 578 darray_exit(&c->btree_roots_extra); 579 free_percpu(c->pcpu); 580 free_percpu(c->usage); 581 mempool_exit(&c->large_bkey_pool); 582 mempool_exit(&c->btree_bounce_pool); 583 bioset_exit(&c->btree_bio); 584 mempool_exit(&c->fill_iter); 585#ifndef BCH_WRITE_REF_DEBUG 586 percpu_ref_exit(&c->writes); 587#endif 588 kfree(rcu_dereference_protected(c->disk_groups, 1)); 589 kfree(c->journal_seq_blacklist_table); 590 591 if (c->write_ref_wq) 592 destroy_workqueue(c->write_ref_wq); 593 if (c->btree_write_submit_wq) 594 destroy_workqueue(c->btree_write_submit_wq); 595 if (c->btree_read_complete_wq) 596 destroy_workqueue(c->btree_read_complete_wq); 597 if (c->copygc_wq) 598 destroy_workqueue(c->copygc_wq); 599 if (c->btree_io_complete_wq) 600 destroy_workqueue(c->btree_io_complete_wq); 601 if (c->btree_update_wq) 602 destroy_workqueue(c->btree_update_wq); 603 604 bch2_free_super(&c->disk_sb); 605 kvfree(c); 606 module_put(THIS_MODULE); 607} 608 609static void bch2_fs_release(struct kobject *kobj) 610{ 611 struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); 612 613 __bch2_fs_free(c); 614} 615 616void __bch2_fs_stop(struct bch_fs *c) 617{ 618 bch_verbose(c, "shutting down"); 619 620 set_bit(BCH_FS_stopping, &c->flags); 621 622 down_write(&c->state_lock); 623 bch2_fs_read_only(c); 624 up_write(&c->state_lock); 625 626 for_each_member_device(c, ca) 627 bch2_dev_unlink(ca); 628 629 if (c->kobj.state_in_sysfs) 630 kobject_del(&c->kobj); 631 632 bch2_fs_debug_exit(c); 633 bch2_fs_chardev_exit(c); 634 635 bch2_ro_ref_put(c); 636 wait_event(c->ro_ref_wait, !refcount_read(&c->ro_ref)); 637 638 kobject_put(&c->counters_kobj); 639 kobject_put(&c->time_stats); 640 kobject_put(&c->opts_dir); 641 kobject_put(&c->internal); 642 643 /* btree prefetch might have kicked off reads in the background: */ 644 bch2_btree_flush_all_reads(c); 645 646 for_each_member_device(c, ca) 647 cancel_work_sync(&ca->io_error_work); 648 649 cancel_work_sync(&c->read_only_work); 650} 651 652void bch2_fs_free(struct bch_fs *c) 653{ 654 unsigned i; 655 656 mutex_lock(&bch_fs_list_lock); 657 list_del(&c->list); 658 mutex_unlock(&bch_fs_list_lock); 659 660 closure_sync(&c->cl); 661 closure_debug_destroy(&c->cl); 662 663 for (i = 0; i < c->sb.nr_devices; i++) { 664 struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); 665 666 if (ca) { 667 EBUG_ON(atomic_long_read(&ca->ref) != 1); 668 bch2_dev_io_ref_stop(ca, READ); 669 bch2_free_super(&ca->disk_sb); 670 bch2_dev_free(ca); 671 } 672 } 673 674 bch_verbose(c, "shutdown complete"); 675 676 kobject_put(&c->kobj); 677} 678 679void bch2_fs_stop(struct bch_fs *c) 680{ 681 __bch2_fs_stop(c); 682 bch2_fs_free(c); 683} 684 685static int bch2_fs_online(struct bch_fs *c) 686{ 687 int ret = 0; 688 689 lockdep_assert_held(&bch_fs_list_lock); 690 691 if (__bch2_uuid_to_fs(c->sb.uuid)) { 692 bch_err(c, "filesystem UUID already open"); 693 return -EINVAL; 694 } 695 696 ret = bch2_fs_chardev_init(c); 697 if (ret) { 698 bch_err(c, "error creating character device"); 699 return ret; 700 } 701 702 bch2_fs_debug_init(c); 703 704 ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?: 705 kobject_add(&c->internal, &c->kobj, "internal") ?: 706 kobject_add(&c->opts_dir, &c->kobj, "options") ?: 707#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT 708 kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: 709#endif 710 kobject_add(&c->counters_kobj, &c->kobj, "counters") ?: 711 bch2_opts_create_sysfs_files(&c->opts_dir, OPT_FS); 712 if (ret) { 713 bch_err(c, "error creating sysfs objects"); 714 return ret; 715 } 716 717 down_write(&c->state_lock); 718 719 for_each_member_device(c, ca) { 720 ret = bch2_dev_sysfs_online(c, ca); 721 if (ret) { 722 bch_err(c, "error creating sysfs objects"); 723 bch2_dev_put(ca); 724 goto err; 725 } 726 } 727 728 BUG_ON(!list_empty(&c->list)); 729 list_add(&c->list, &bch_fs_list); 730err: 731 up_write(&c->state_lock); 732 return ret; 733} 734 735static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) 736{ 737 struct bch_fs *c; 738 struct printbuf name = PRINTBUF; 739 unsigned i, iter_size; 740 int ret = 0; 741 742 c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); 743 if (!c) { 744 c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc); 745 goto out; 746 } 747 748 c->stdio = (void *)(unsigned long) opts.stdio; 749 750 __module_get(THIS_MODULE); 751 752 closure_init(&c->cl, NULL); 753 754 c->kobj.kset = bcachefs_kset; 755 kobject_init(&c->kobj, &bch2_fs_ktype); 756 kobject_init(&c->internal, &bch2_fs_internal_ktype); 757 kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); 758 kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); 759 kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype); 760 761 c->minor = -1; 762 c->disk_sb.fs_sb = true; 763 764 init_rwsem(&c->state_lock); 765 mutex_init(&c->sb_lock); 766 mutex_init(&c->replicas_gc_lock); 767 mutex_init(&c->btree_root_lock); 768 INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); 769 770 refcount_set(&c->ro_ref, 1); 771 init_waitqueue_head(&c->ro_ref_wait); 772 spin_lock_init(&c->recovery_pass_lock); 773 sema_init(&c->online_fsck_mutex, 1); 774 775 for (i = 0; i < BCH_TIME_STAT_NR; i++) 776 bch2_time_stats_init(&c->times[i]); 777 778 bch2_fs_copygc_init(c); 779 bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); 780 bch2_fs_btree_iter_init_early(c); 781 bch2_fs_btree_interior_update_init_early(c); 782 bch2_fs_journal_keys_init(c); 783 bch2_fs_allocator_background_init(c); 784 bch2_fs_allocator_foreground_init(c); 785 bch2_fs_rebalance_init(c); 786 bch2_fs_quota_init(c); 787 bch2_fs_ec_init_early(c); 788 bch2_fs_move_init(c); 789 bch2_fs_sb_errors_init_early(c); 790 791 INIT_LIST_HEAD(&c->list); 792 793 mutex_init(&c->bio_bounce_pages_lock); 794 mutex_init(&c->snapshot_table_lock); 795 init_rwsem(&c->snapshot_create_lock); 796 797 spin_lock_init(&c->btree_write_error_lock); 798 799 INIT_LIST_HEAD(&c->journal_iters); 800 801 INIT_LIST_HEAD(&c->fsck_error_msgs); 802 mutex_init(&c->fsck_error_msgs_lock); 803 804 seqcount_init(&c->usage_lock); 805 806 sema_init(&c->io_in_flight, 128); 807 808 INIT_LIST_HEAD(&c->vfs_inodes_list); 809 mutex_init(&c->vfs_inodes_lock); 810 811 c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; 812 c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; 813 c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; 814 815 bch2_fs_btree_cache_init_early(&c->btree_cache); 816 817 mutex_init(&c->sectors_available_lock); 818 819 ret = percpu_init_rwsem(&c->mark_lock); 820 if (ret) 821 goto err; 822 823 mutex_lock(&c->sb_lock); 824 ret = bch2_sb_to_fs(c, sb); 825 mutex_unlock(&c->sb_lock); 826 827 if (ret) 828 goto err; 829 830 pr_uuid(&name, c->sb.user_uuid.b); 831 ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; 832 if (ret) 833 goto err; 834 835 strscpy(c->name, name.buf, sizeof(c->name)); 836 printbuf_exit(&name); 837 838 /* Compat: */ 839 if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && 840 !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) 841 SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); 842 843 if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && 844 !BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) 845 SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100); 846 847 c->opts = bch2_opts_default; 848 ret = bch2_opts_from_sb(&c->opts, sb); 849 if (ret) 850 goto err; 851 852 bch2_opts_apply(&c->opts, opts); 853 854 c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; 855 if (c->opts.inodes_use_key_cache) 856 c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes; 857 c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops; 858 859 c->block_bits = ilog2(block_sectors(c)); 860 c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); 861 862 if (bch2_fs_init_fault("fs_alloc")) { 863 bch_err(c, "fs_alloc fault injected"); 864 ret = -EFAULT; 865 goto err; 866 } 867 868 iter_size = sizeof(struct sort_iter) + 869 (btree_blocks(c) + 1) * 2 * 870 sizeof(struct sort_iter_set); 871 872 if (!(c->btree_update_wq = alloc_workqueue("bcachefs", 873 WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || 874 !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", 875 WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || 876 !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", 877 WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || 878 !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", 879 WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || 880 !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit", 881 WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || 882 !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", 883 WQ_FREEZABLE, 0)) || 884#ifndef BCH_WRITE_REF_DEBUG 885 percpu_ref_init(&c->writes, bch2_writes_disabled, 886 PERCPU_REF_INIT_DEAD, GFP_KERNEL) || 887#endif 888 mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || 889 bioset_init(&c->btree_bio, 1, 890 max(offsetof(struct btree_read_bio, bio), 891 offsetof(struct btree_write_bio, wbio.bio)), 892 BIOSET_NEED_BVECS) || 893 !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || 894 !(c->usage = alloc_percpu(struct bch_fs_usage_base)) || 895 !(c->online_reserved = alloc_percpu(u64)) || 896 mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, 897 c->opts.btree_node_size) || 898 mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) { 899 ret = -BCH_ERR_ENOMEM_fs_other_alloc; 900 goto err; 901 } 902 903 ret = bch2_fs_counters_init(c) ?: 904 bch2_fs_sb_errors_init(c) ?: 905 bch2_io_clock_init(&c->io_clock[READ]) ?: 906 bch2_io_clock_init(&c->io_clock[WRITE]) ?: 907 bch2_fs_journal_init(&c->journal) ?: 908 bch2_fs_btree_iter_init(c) ?: 909 bch2_fs_btree_cache_init(c) ?: 910 bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: 911 bch2_fs_btree_interior_update_init(c) ?: 912 bch2_fs_btree_gc_init(c) ?: 913 bch2_fs_buckets_waiting_for_journal_init(c) ?: 914 bch2_fs_btree_write_buffer_init(c) ?: 915 bch2_fs_subvolumes_init(c) ?: 916 bch2_fs_io_read_init(c) ?: 917 bch2_fs_io_write_init(c) ?: 918 bch2_fs_nocow_locking_init(c) ?: 919 bch2_fs_encryption_init(c) ?: 920 bch2_fs_compress_init(c) ?: 921 bch2_fs_ec_init(c) ?: 922 bch2_fs_vfs_init(c) ?: 923 bch2_fs_fsio_init(c) ?: 924 bch2_fs_fs_io_buffered_init(c) ?: 925 bch2_fs_fs_io_direct_init(c); 926 if (ret) 927 goto err; 928 929#ifdef CONFIG_UNICODE 930 /* Default encoding until we can potentially have more as an option. */ 931 c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING); 932 if (IS_ERR(c->cf_encoding)) { 933 printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u", 934 unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), 935 unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), 936 unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); 937 ret = -EINVAL; 938 goto err; 939 } 940 bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u", 941 unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), 942 unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), 943 unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); 944#else 945 if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) { 946 printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n"); 947 ret = -EINVAL; 948 goto err; 949 } 950#endif 951 952 for (i = 0; i < c->sb.nr_devices; i++) { 953 if (!bch2_member_exists(c->disk_sb.sb, i)) 954 continue; 955 ret = bch2_dev_alloc(c, i); 956 if (ret) 957 goto err; 958 } 959 960 bch2_journal_entry_res_resize(&c->journal, 961 &c->btree_root_journal_res, 962 BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX)); 963 bch2_journal_entry_res_resize(&c->journal, 964 &c->clock_journal_res, 965 (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); 966 967 mutex_lock(&bch_fs_list_lock); 968 ret = bch2_fs_online(c); 969 mutex_unlock(&bch_fs_list_lock); 970 971 if (ret) 972 goto err; 973out: 974 return c; 975err: 976 bch2_fs_free(c); 977 c = ERR_PTR(ret); 978 goto out; 979} 980 981noinline_for_stack 982static void print_mount_opts(struct bch_fs *c) 983{ 984 enum bch_opt_id i; 985 struct printbuf p = PRINTBUF; 986 bool first = true; 987 988 prt_str(&p, "starting version "); 989 bch2_version_to_text(&p, c->sb.version); 990 991 for (i = 0; i < bch2_opts_nr; i++) { 992 const struct bch_option *opt = &bch2_opt_table[i]; 993 u64 v = bch2_opt_get_by_id(&c->opts, i); 994 995 if (!(opt->flags & OPT_MOUNT)) 996 continue; 997 998 if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) 999 continue; 1000 1001 prt_str(&p, first ? " opts=" : ","); 1002 first = false; 1003 bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); 1004 } 1005 1006 if (c->sb.version_incompat_allowed != c->sb.version) { 1007 prt_printf(&p, "\n allowing incompatible features above "); 1008 bch2_version_to_text(&p, c->sb.version_incompat_allowed); 1009 } 1010 1011 bch_info(c, "%s", p.buf); 1012 printbuf_exit(&p); 1013} 1014 1015static bool bch2_fs_may_start(struct bch_fs *c) 1016{ 1017 struct bch_dev *ca; 1018 unsigned i, flags = 0; 1019 1020 if (c->opts.very_degraded) 1021 flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; 1022 1023 if (c->opts.degraded) 1024 flags |= BCH_FORCE_IF_DEGRADED; 1025 1026 if (!c->opts.degraded && 1027 !c->opts.very_degraded) { 1028 mutex_lock(&c->sb_lock); 1029 1030 for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { 1031 if (!bch2_member_exists(c->disk_sb.sb, i)) 1032 continue; 1033 1034 ca = bch2_dev_locked(c, i); 1035 1036 if (!bch2_dev_is_online(ca) && 1037 (ca->mi.state == BCH_MEMBER_STATE_rw || 1038 ca->mi.state == BCH_MEMBER_STATE_ro)) { 1039 mutex_unlock(&c->sb_lock); 1040 return false; 1041 } 1042 } 1043 mutex_unlock(&c->sb_lock); 1044 } 1045 1046 return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); 1047} 1048 1049int bch2_fs_start(struct bch_fs *c) 1050{ 1051 time64_t now = ktime_get_real_seconds(); 1052 int ret = 0; 1053 1054 print_mount_opts(c); 1055 1056 if (!bch2_fs_may_start(c)) 1057 return -BCH_ERR_insufficient_devices_to_start; 1058 1059 down_write(&c->state_lock); 1060 mutex_lock(&c->sb_lock); 1061 1062 BUG_ON(test_bit(BCH_FS_started, &c->flags)); 1063 1064 if (!bch2_sb_field_get_minsize(&c->disk_sb, ext, 1065 sizeof(struct bch_sb_field_ext) / sizeof(u64))) { 1066 mutex_unlock(&c->sb_lock); 1067 up_write(&c->state_lock); 1068 ret = -BCH_ERR_ENOSPC_sb; 1069 goto err; 1070 } 1071 1072 ret = bch2_sb_members_v2_init(c); 1073 if (ret) { 1074 mutex_unlock(&c->sb_lock); 1075 up_write(&c->state_lock); 1076 goto err; 1077 } 1078 1079 for_each_online_member(c, ca) 1080 bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now); 1081 1082 mutex_unlock(&c->sb_lock); 1083 1084 for_each_rw_member(c, ca) 1085 bch2_dev_allocator_add(c, ca); 1086 bch2_recalc_capacity(c); 1087 up_write(&c->state_lock); 1088 1089 c->recovery_task = current; 1090 ret = BCH_SB_INITIALIZED(c->disk_sb.sb) 1091 ? bch2_fs_recovery(c) 1092 : bch2_fs_initialize(c); 1093 c->recovery_task = NULL; 1094 1095 if (ret) 1096 goto err; 1097 1098 ret = bch2_opts_check_may_set(c); 1099 if (ret) 1100 goto err; 1101 1102 if (bch2_fs_init_fault("fs_start")) { 1103 ret = -BCH_ERR_injected_fs_start; 1104 goto err; 1105 } 1106 1107 set_bit(BCH_FS_started, &c->flags); 1108 wake_up(&c->ro_ref_wait); 1109 1110 down_write(&c->state_lock); 1111 if (c->opts.read_only) 1112 bch2_fs_read_only(c); 1113 else if (!test_bit(BCH_FS_rw, &c->flags)) 1114 ret = bch2_fs_read_write(c); 1115 up_write(&c->state_lock); 1116 1117err: 1118 if (ret) 1119 bch_err_msg(c, ret, "starting filesystem"); 1120 else 1121 bch_verbose(c, "done starting filesystem"); 1122 return ret; 1123} 1124 1125static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) 1126{ 1127 struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); 1128 1129 if (le16_to_cpu(sb->block_size) != block_sectors(c)) 1130 return -BCH_ERR_mismatched_block_size; 1131 1132 if (le16_to_cpu(m.bucket_size) < 1133 BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) 1134 return -BCH_ERR_bucket_size_too_small; 1135 1136 return 0; 1137} 1138 1139static int bch2_dev_in_fs(struct bch_sb_handle *fs, 1140 struct bch_sb_handle *sb, 1141 struct bch_opts *opts) 1142{ 1143 if (fs == sb) 1144 return 0; 1145 1146 if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid)) 1147 return -BCH_ERR_device_not_a_member_of_filesystem; 1148 1149 if (!bch2_member_exists(fs->sb, sb->sb->dev_idx)) 1150 return -BCH_ERR_device_has_been_removed; 1151 1152 if (fs->sb->block_size != sb->sb->block_size) 1153 return -BCH_ERR_mismatched_block_size; 1154 1155 if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq || 1156 le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq) 1157 return 0; 1158 1159 if (fs->sb->seq == sb->sb->seq && 1160 fs->sb->write_time != sb->sb->write_time) { 1161 struct printbuf buf = PRINTBUF; 1162 1163 prt_str(&buf, "Split brain detected between "); 1164 prt_bdevname(&buf, sb->bdev); 1165 prt_str(&buf, " and "); 1166 prt_bdevname(&buf, fs->bdev); 1167 prt_char(&buf, ':'); 1168 prt_newline(&buf); 1169 prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq)); 1170 prt_newline(&buf); 1171 1172 prt_bdevname(&buf, fs->bdev); 1173 prt_char(&buf, ' '); 1174 bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time)); 1175 prt_newline(&buf); 1176 1177 prt_bdevname(&buf, sb->bdev); 1178 prt_char(&buf, ' '); 1179 bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time)); 1180 prt_newline(&buf); 1181 1182 if (!opts->no_splitbrain_check) 1183 prt_printf(&buf, "Not using older sb"); 1184 1185 pr_err("%s", buf.buf); 1186 printbuf_exit(&buf); 1187 1188 if (!opts->no_splitbrain_check) 1189 return -BCH_ERR_device_splitbrain; 1190 } 1191 1192 struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx); 1193 u64 seq_from_fs = le64_to_cpu(m.seq); 1194 u64 seq_from_member = le64_to_cpu(sb->sb->seq); 1195 1196 if (seq_from_fs && seq_from_fs < seq_from_member) { 1197 struct printbuf buf = PRINTBUF; 1198 1199 prt_str(&buf, "Split brain detected between "); 1200 prt_bdevname(&buf, sb->bdev); 1201 prt_str(&buf, " and "); 1202 prt_bdevname(&buf, fs->bdev); 1203 prt_char(&buf, ':'); 1204 prt_newline(&buf); 1205 1206 prt_bdevname(&buf, fs->bdev); 1207 prt_str(&buf, " believes seq of "); 1208 prt_bdevname(&buf, sb->bdev); 1209 prt_printf(&buf, " to be %llu, but ", seq_from_fs); 1210 prt_bdevname(&buf, sb->bdev); 1211 prt_printf(&buf, " has %llu\n", seq_from_member); 1212 1213 if (!opts->no_splitbrain_check) { 1214 prt_str(&buf, "Not using "); 1215 prt_bdevname(&buf, sb->bdev); 1216 } 1217 1218 pr_err("%s", buf.buf); 1219 printbuf_exit(&buf); 1220 1221 if (!opts->no_splitbrain_check) 1222 return -BCH_ERR_device_splitbrain; 1223 } 1224 1225 return 0; 1226} 1227 1228/* Device startup/shutdown: */ 1229 1230static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw) 1231{ 1232 if (!percpu_ref_is_zero(&ca->io_ref[rw])) { 1233 reinit_completion(&ca->io_ref_completion[rw]); 1234 percpu_ref_kill(&ca->io_ref[rw]); 1235 wait_for_completion(&ca->io_ref_completion[rw]); 1236 } 1237} 1238 1239static void bch2_dev_release(struct kobject *kobj) 1240{ 1241 struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); 1242 1243 kfree(ca); 1244} 1245 1246static void bch2_dev_free(struct bch_dev *ca) 1247{ 1248 WARN_ON(!percpu_ref_is_zero(&ca->io_ref[WRITE])); 1249 WARN_ON(!percpu_ref_is_zero(&ca->io_ref[READ])); 1250 1251 cancel_work_sync(&ca->io_error_work); 1252 1253 bch2_dev_unlink(ca); 1254 1255 if (ca->kobj.state_in_sysfs) 1256 kobject_del(&ca->kobj); 1257 1258 bch2_free_super(&ca->disk_sb); 1259 bch2_dev_allocator_background_exit(ca); 1260 bch2_dev_journal_exit(ca); 1261 1262 free_percpu(ca->io_done); 1263 bch2_dev_buckets_free(ca); 1264 kfree(ca->sb_read_scratch); 1265 1266 bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); 1267 bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); 1268 1269 percpu_ref_exit(&ca->io_ref[WRITE]); 1270 percpu_ref_exit(&ca->io_ref[READ]); 1271#ifndef CONFIG_BCACHEFS_DEBUG 1272 percpu_ref_exit(&ca->ref); 1273#endif 1274 kobject_put(&ca->kobj); 1275} 1276 1277static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) 1278{ 1279 1280 lockdep_assert_held(&c->state_lock); 1281 1282 if (percpu_ref_is_zero(&ca->io_ref[READ])) 1283 return; 1284 1285 __bch2_dev_read_only(c, ca); 1286 1287 bch2_dev_io_ref_stop(ca, READ); 1288 1289 bch2_dev_unlink(ca); 1290 1291 bch2_free_super(&ca->disk_sb); 1292 bch2_dev_journal_exit(ca); 1293} 1294 1295#ifndef CONFIG_BCACHEFS_DEBUG 1296static void bch2_dev_ref_complete(struct percpu_ref *ref) 1297{ 1298 struct bch_dev *ca = container_of(ref, struct bch_dev, ref); 1299 1300 complete(&ca->ref_completion); 1301} 1302#endif 1303 1304static void bch2_dev_io_ref_read_complete(struct percpu_ref *ref) 1305{ 1306 struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref[READ]); 1307 1308 complete(&ca->io_ref_completion[READ]); 1309} 1310 1311static void bch2_dev_io_ref_write_complete(struct percpu_ref *ref) 1312{ 1313 struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref[WRITE]); 1314 1315 complete(&ca->io_ref_completion[WRITE]); 1316} 1317 1318static void bch2_dev_unlink(struct bch_dev *ca) 1319{ 1320 struct kobject *b; 1321 1322 /* 1323 * This is racy w.r.t. the underlying block device being hot-removed, 1324 * which removes it from sysfs. 1325 * 1326 * It'd be lovely if we had a way to handle this race, but the sysfs 1327 * code doesn't appear to provide a good method and block/holder.c is 1328 * susceptible as well: 1329 */ 1330 if (ca->kobj.state_in_sysfs && 1331 ca->disk_sb.bdev && 1332 (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) { 1333 sysfs_remove_link(b, "bcachefs"); 1334 sysfs_remove_link(&ca->kobj, "block"); 1335 } 1336} 1337 1338static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) 1339{ 1340 int ret; 1341 1342 if (!c->kobj.state_in_sysfs) 1343 return 0; 1344 1345 if (!ca->kobj.state_in_sysfs) { 1346 ret = kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx) ?: 1347 bch2_opts_create_sysfs_files(&ca->kobj, OPT_DEVICE); 1348 if (ret) 1349 return ret; 1350 } 1351 1352 if (ca->disk_sb.bdev) { 1353 struct kobject *block = bdev_kobj(ca->disk_sb.bdev); 1354 1355 ret = sysfs_create_link(block, &ca->kobj, "bcachefs"); 1356 if (ret) 1357 return ret; 1358 1359 ret = sysfs_create_link(&ca->kobj, block, "block"); 1360 if (ret) 1361 return ret; 1362 } 1363 1364 return 0; 1365} 1366 1367static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, 1368 struct bch_member *member) 1369{ 1370 struct bch_dev *ca; 1371 unsigned i; 1372 1373 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 1374 if (!ca) 1375 return NULL; 1376 1377 kobject_init(&ca->kobj, &bch2_dev_ktype); 1378 init_completion(&ca->ref_completion); 1379 init_completion(&ca->io_ref_completion[READ]); 1380 init_completion(&ca->io_ref_completion[WRITE]); 1381 1382 INIT_WORK(&ca->io_error_work, bch2_io_error_work); 1383 1384 bch2_time_stats_quantiles_init(&ca->io_latency[READ]); 1385 bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]); 1386 1387 ca->mi = bch2_mi_to_cpu(member); 1388 1389 for (i = 0; i < ARRAY_SIZE(member->errors); i++) 1390 atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i])); 1391 1392 ca->uuid = member->uuid; 1393 1394 ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, 1395 ca->mi.bucket_size / btree_sectors(c)); 1396 1397#ifndef CONFIG_BCACHEFS_DEBUG 1398 if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL)) 1399 goto err; 1400#else 1401 atomic_long_set(&ca->ref, 1); 1402#endif 1403 1404 bch2_dev_allocator_background_init(ca); 1405 1406 if (percpu_ref_init(&ca->io_ref[READ], bch2_dev_io_ref_read_complete, 1407 PERCPU_REF_INIT_DEAD, GFP_KERNEL) || 1408 percpu_ref_init(&ca->io_ref[WRITE], bch2_dev_io_ref_write_complete, 1409 PERCPU_REF_INIT_DEAD, GFP_KERNEL) || 1410 !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) || 1411 bch2_dev_buckets_alloc(c, ca) || 1412 !(ca->io_done = alloc_percpu(*ca->io_done))) 1413 goto err; 1414 1415 return ca; 1416err: 1417 bch2_dev_free(ca); 1418 return NULL; 1419} 1420 1421static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, 1422 unsigned dev_idx) 1423{ 1424 ca->dev_idx = dev_idx; 1425 __set_bit(ca->dev_idx, ca->self.d); 1426 scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); 1427 1428 ca->fs = c; 1429 rcu_assign_pointer(c->devs[ca->dev_idx], ca); 1430 1431 if (bch2_dev_sysfs_online(c, ca)) 1432 pr_warn("error creating sysfs objects"); 1433} 1434 1435static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) 1436{ 1437 struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx); 1438 struct bch_dev *ca = NULL; 1439 1440 if (bch2_fs_init_fault("dev_alloc")) 1441 goto err; 1442 1443 ca = __bch2_dev_alloc(c, &member); 1444 if (!ca) 1445 goto err; 1446 1447 ca->fs = c; 1448 1449 bch2_dev_attach(c, ca, dev_idx); 1450 return 0; 1451err: 1452 return -BCH_ERR_ENOMEM_dev_alloc; 1453} 1454 1455static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) 1456{ 1457 unsigned ret; 1458 1459 if (bch2_dev_is_online(ca)) { 1460 bch_err(ca, "already have device online in slot %u", 1461 sb->sb->dev_idx); 1462 return -BCH_ERR_device_already_online; 1463 } 1464 1465 if (get_capacity(sb->bdev->bd_disk) < 1466 ca->mi.bucket_size * ca->mi.nbuckets) { 1467 bch_err(ca, "cannot online: device too small"); 1468 return -BCH_ERR_device_size_too_small; 1469 } 1470 1471 BUG_ON(!percpu_ref_is_zero(&ca->io_ref[READ])); 1472 BUG_ON(!percpu_ref_is_zero(&ca->io_ref[WRITE])); 1473 1474 ret = bch2_dev_journal_init(ca, sb->sb); 1475 if (ret) 1476 return ret; 1477 1478 /* Commit: */ 1479 ca->disk_sb = *sb; 1480 memset(sb, 0, sizeof(*sb)); 1481 1482 /* 1483 * Stash pointer to the filesystem for blk_holder_ops - note that once 1484 * attached to a filesystem, we will always close the block device 1485 * before tearing down the filesystem object. 1486 */ 1487 ca->disk_sb.holder->c = ca->fs; 1488 1489 ca->dev = ca->disk_sb.bdev->bd_dev; 1490 1491 percpu_ref_reinit(&ca->io_ref[READ]); 1492 1493 return 0; 1494} 1495 1496static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) 1497{ 1498 struct bch_dev *ca; 1499 int ret; 1500 1501 lockdep_assert_held(&c->state_lock); 1502 1503 if (le64_to_cpu(sb->sb->seq) > 1504 le64_to_cpu(c->disk_sb.sb->seq)) 1505 bch2_sb_to_fs(c, sb->sb); 1506 1507 BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx)); 1508 1509 ca = bch2_dev_locked(c, sb->sb->dev_idx); 1510 1511 ret = __bch2_dev_attach_bdev(ca, sb); 1512 if (ret) 1513 return ret; 1514 1515 bch2_dev_sysfs_online(c, ca); 1516 1517 struct printbuf name = PRINTBUF; 1518 prt_bdevname(&name, ca->disk_sb.bdev); 1519 1520 if (c->sb.nr_devices == 1) 1521 strscpy(c->name, name.buf, sizeof(c->name)); 1522 strscpy(ca->name, name.buf, sizeof(ca->name)); 1523 1524 printbuf_exit(&name); 1525 1526 bch2_rebalance_wakeup(c); 1527 return 0; 1528} 1529 1530/* Device management: */ 1531 1532/* 1533 * Note: this function is also used by the error paths - when a particular 1534 * device sees an error, we call it to determine whether we can just set the 1535 * device RO, or - if this function returns false - we'll set the whole 1536 * filesystem RO: 1537 * 1538 * XXX: maybe we should be more explicit about whether we're changing state 1539 * because we got an error or what have you? 1540 */ 1541bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, 1542 enum bch_member_state new_state, int flags) 1543{ 1544 struct bch_devs_mask new_online_devs; 1545 int nr_rw = 0, required; 1546 1547 lockdep_assert_held(&c->state_lock); 1548 1549 switch (new_state) { 1550 case BCH_MEMBER_STATE_rw: 1551 return true; 1552 case BCH_MEMBER_STATE_ro: 1553 if (ca->mi.state != BCH_MEMBER_STATE_rw) 1554 return true; 1555 1556 /* do we have enough devices to write to? */ 1557 for_each_member_device(c, ca2) 1558 if (ca2 != ca) 1559 nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw; 1560 1561 required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) 1562 ? c->opts.metadata_replicas 1563 : metadata_replicas_required(c), 1564 !(flags & BCH_FORCE_IF_DATA_DEGRADED) 1565 ? c->opts.data_replicas 1566 : data_replicas_required(c)); 1567 1568 return nr_rw >= required; 1569 case BCH_MEMBER_STATE_failed: 1570 case BCH_MEMBER_STATE_spare: 1571 if (ca->mi.state != BCH_MEMBER_STATE_rw && 1572 ca->mi.state != BCH_MEMBER_STATE_ro) 1573 return true; 1574 1575 /* do we have enough devices to read from? */ 1576 new_online_devs = bch2_online_devs(c); 1577 __clear_bit(ca->dev_idx, new_online_devs.d); 1578 1579 return bch2_have_enough_devs(c, new_online_devs, flags, false); 1580 default: 1581 BUG(); 1582 } 1583} 1584 1585static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) 1586{ 1587 bch2_dev_io_ref_stop(ca, WRITE); 1588 1589 /* 1590 * The allocator thread itself allocates btree nodes, so stop it first: 1591 */ 1592 bch2_dev_allocator_remove(c, ca); 1593 bch2_recalc_capacity(c); 1594 bch2_dev_journal_stop(&c->journal, ca); 1595} 1596 1597static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) 1598{ 1599 lockdep_assert_held(&c->state_lock); 1600 1601 BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw); 1602 1603 bch2_dev_allocator_add(c, ca); 1604 bch2_recalc_capacity(c); 1605 1606 if (percpu_ref_is_zero(&ca->io_ref[WRITE])) 1607 percpu_ref_reinit(&ca->io_ref[WRITE]); 1608 1609 bch2_dev_do_discards(ca); 1610} 1611 1612int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, 1613 enum bch_member_state new_state, int flags) 1614{ 1615 struct bch_member *m; 1616 int ret = 0; 1617 1618 if (ca->mi.state == new_state) 1619 return 0; 1620 1621 if (!bch2_dev_state_allowed(c, ca, new_state, flags)) 1622 return -BCH_ERR_device_state_not_allowed; 1623 1624 if (new_state != BCH_MEMBER_STATE_rw) 1625 __bch2_dev_read_only(c, ca); 1626 1627 bch_notice(ca, "%s", bch2_member_states[new_state]); 1628 1629 mutex_lock(&c->sb_lock); 1630 m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); 1631 SET_BCH_MEMBER_STATE(m, new_state); 1632 bch2_write_super(c); 1633 mutex_unlock(&c->sb_lock); 1634 1635 if (new_state == BCH_MEMBER_STATE_rw) 1636 __bch2_dev_read_write(c, ca); 1637 1638 bch2_rebalance_wakeup(c); 1639 1640 return ret; 1641} 1642 1643int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, 1644 enum bch_member_state new_state, int flags) 1645{ 1646 int ret; 1647 1648 down_write(&c->state_lock); 1649 ret = __bch2_dev_set_state(c, ca, new_state, flags); 1650 up_write(&c->state_lock); 1651 1652 return ret; 1653} 1654 1655/* Device add/removal: */ 1656 1657int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) 1658{ 1659 struct bch_member *m; 1660 unsigned dev_idx = ca->dev_idx, data; 1661 int ret; 1662 1663 down_write(&c->state_lock); 1664 1665 /* 1666 * We consume a reference to ca->ref, regardless of whether we succeed 1667 * or fail: 1668 */ 1669 bch2_dev_put(ca); 1670 1671 if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { 1672 bch_err(ca, "Cannot remove without losing data"); 1673 ret = -BCH_ERR_device_state_not_allowed; 1674 goto err; 1675 } 1676 1677 __bch2_dev_read_only(c, ca); 1678 1679 ret = bch2_dev_data_drop(c, ca->dev_idx, flags); 1680 bch_err_msg(ca, ret, "bch2_dev_data_drop()"); 1681 if (ret) 1682 goto err; 1683 1684 ret = bch2_dev_remove_alloc(c, ca); 1685 bch_err_msg(ca, ret, "bch2_dev_remove_alloc()"); 1686 if (ret) 1687 goto err; 1688 1689 /* 1690 * We need to flush the entire journal to get rid of keys that reference 1691 * the device being removed before removing the superblock entry 1692 */ 1693 bch2_journal_flush_all_pins(&c->journal); 1694 1695 /* 1696 * this is really just needed for the bch2_replicas_gc_(start|end) 1697 * calls, and could be cleaned up: 1698 */ 1699 ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); 1700 bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()"); 1701 if (ret) 1702 goto err; 1703 1704 ret = bch2_journal_flush(&c->journal); 1705 bch_err_msg(ca, ret, "bch2_journal_flush()"); 1706 if (ret) 1707 goto err; 1708 1709 ret = bch2_replicas_gc2(c); 1710 bch_err_msg(ca, ret, "bch2_replicas_gc2()"); 1711 if (ret) 1712 goto err; 1713 1714 data = bch2_dev_has_data(c, ca); 1715 if (data) { 1716 struct printbuf data_has = PRINTBUF; 1717 1718 prt_bitflags(&data_has, __bch2_data_types, data); 1719 bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); 1720 printbuf_exit(&data_has); 1721 ret = -EBUSY; 1722 goto err; 1723 } 1724 1725 __bch2_dev_offline(c, ca); 1726 1727 mutex_lock(&c->sb_lock); 1728 rcu_assign_pointer(c->devs[ca->dev_idx], NULL); 1729 mutex_unlock(&c->sb_lock); 1730 1731#ifndef CONFIG_BCACHEFS_DEBUG 1732 percpu_ref_kill(&ca->ref); 1733#else 1734 ca->dying = true; 1735 bch2_dev_put(ca); 1736#endif 1737 wait_for_completion(&ca->ref_completion); 1738 1739 bch2_dev_free(ca); 1740 1741 /* 1742 * Free this device's slot in the bch_member array - all pointers to 1743 * this device must be gone: 1744 */ 1745 mutex_lock(&c->sb_lock); 1746 m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx); 1747 memset(&m->uuid, 0, sizeof(m->uuid)); 1748 1749 bch2_write_super(c); 1750 1751 mutex_unlock(&c->sb_lock); 1752 up_write(&c->state_lock); 1753 return 0; 1754err: 1755 if (test_bit(BCH_FS_rw, &c->flags) && 1756 ca->mi.state == BCH_MEMBER_STATE_rw && 1757 !percpu_ref_is_zero(&ca->io_ref[READ])) 1758 __bch2_dev_read_write(c, ca); 1759 up_write(&c->state_lock); 1760 return ret; 1761} 1762 1763/* Add new device to running filesystem: */ 1764int bch2_dev_add(struct bch_fs *c, const char *path) 1765{ 1766 struct bch_opts opts = bch2_opts_empty(); 1767 struct bch_sb_handle sb; 1768 struct bch_dev *ca = NULL; 1769 struct printbuf errbuf = PRINTBUF; 1770 struct printbuf label = PRINTBUF; 1771 int ret; 1772 1773 ret = bch2_read_super(path, &opts, &sb); 1774 bch_err_msg(c, ret, "reading super"); 1775 if (ret) 1776 goto err; 1777 1778 struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx); 1779 1780 if (BCH_MEMBER_GROUP(&dev_mi)) { 1781 bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1); 1782 if (label.allocation_failure) { 1783 ret = -ENOMEM; 1784 goto err; 1785 } 1786 } 1787 1788 ret = bch2_dev_may_add(sb.sb, c); 1789 if (ret) 1790 goto err; 1791 1792 ca = __bch2_dev_alloc(c, &dev_mi); 1793 if (!ca) { 1794 ret = -ENOMEM; 1795 goto err; 1796 } 1797 1798 ret = __bch2_dev_attach_bdev(ca, &sb); 1799 if (ret) 1800 goto err; 1801 1802 down_write(&c->state_lock); 1803 mutex_lock(&c->sb_lock); 1804 1805 ret = bch2_sb_from_fs(c, ca); 1806 bch_err_msg(c, ret, "setting up new superblock"); 1807 if (ret) 1808 goto err_unlock; 1809 1810 if (dynamic_fault("bcachefs:add:no_slot")) 1811 goto err_unlock; 1812 1813 ret = bch2_sb_member_alloc(c); 1814 if (ret < 0) { 1815 bch_err_msg(c, ret, "setting up new superblock"); 1816 goto err_unlock; 1817 } 1818 unsigned dev_idx = ret; 1819 1820 /* success: */ 1821 1822 dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds()); 1823 *bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi; 1824 1825 ca->disk_sb.sb->dev_idx = dev_idx; 1826 bch2_dev_attach(c, ca, dev_idx); 1827 1828 if (BCH_MEMBER_GROUP(&dev_mi)) { 1829 ret = __bch2_dev_group_set(c, ca, label.buf); 1830 bch_err_msg(c, ret, "creating new label"); 1831 if (ret) 1832 goto err_unlock; 1833 } 1834 1835 bch2_write_super(c); 1836 mutex_unlock(&c->sb_lock); 1837 1838 ret = bch2_dev_usage_init(ca, false); 1839 if (ret) 1840 goto err_late; 1841 1842 ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); 1843 bch_err_msg(ca, ret, "marking new superblock"); 1844 if (ret) 1845 goto err_late; 1846 1847 ret = bch2_fs_freespace_init(c); 1848 bch_err_msg(ca, ret, "initializing free space"); 1849 if (ret) 1850 goto err_late; 1851 1852 if (ca->mi.state == BCH_MEMBER_STATE_rw) 1853 __bch2_dev_read_write(c, ca); 1854 1855 ret = bch2_dev_journal_alloc(ca, false); 1856 bch_err_msg(c, ret, "allocating journal"); 1857 if (ret) 1858 goto err_late; 1859 1860 up_write(&c->state_lock); 1861out: 1862 printbuf_exit(&label); 1863 printbuf_exit(&errbuf); 1864 bch_err_fn(c, ret); 1865 return ret; 1866 1867err_unlock: 1868 mutex_unlock(&c->sb_lock); 1869 up_write(&c->state_lock); 1870err: 1871 if (ca) 1872 bch2_dev_free(ca); 1873 bch2_free_super(&sb); 1874 goto out; 1875err_late: 1876 up_write(&c->state_lock); 1877 ca = NULL; 1878 goto err; 1879} 1880 1881/* Hot add existing device to running filesystem: */ 1882int bch2_dev_online(struct bch_fs *c, const char *path) 1883{ 1884 struct bch_opts opts = bch2_opts_empty(); 1885 struct bch_sb_handle sb = { NULL }; 1886 struct bch_dev *ca; 1887 unsigned dev_idx; 1888 int ret; 1889 1890 down_write(&c->state_lock); 1891 1892 ret = bch2_read_super(path, &opts, &sb); 1893 if (ret) { 1894 up_write(&c->state_lock); 1895 return ret; 1896 } 1897 1898 dev_idx = sb.sb->dev_idx; 1899 1900 ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts); 1901 bch_err_msg(c, ret, "bringing %s online", path); 1902 if (ret) 1903 goto err; 1904 1905 ret = bch2_dev_attach_bdev(c, &sb); 1906 if (ret) 1907 goto err; 1908 1909 ca = bch2_dev_locked(c, dev_idx); 1910 1911 ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); 1912 bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path); 1913 if (ret) 1914 goto err; 1915 1916 if (ca->mi.state == BCH_MEMBER_STATE_rw) 1917 __bch2_dev_read_write(c, ca); 1918 1919 if (!ca->mi.freespace_initialized) { 1920 ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); 1921 bch_err_msg(ca, ret, "initializing free space"); 1922 if (ret) 1923 goto err; 1924 } 1925 1926 if (!ca->journal.nr) { 1927 ret = bch2_dev_journal_alloc(ca, false); 1928 bch_err_msg(ca, ret, "allocating journal"); 1929 if (ret) 1930 goto err; 1931 } 1932 1933 mutex_lock(&c->sb_lock); 1934 bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = 1935 cpu_to_le64(ktime_get_real_seconds()); 1936 bch2_write_super(c); 1937 mutex_unlock(&c->sb_lock); 1938 1939 up_write(&c->state_lock); 1940 return 0; 1941err: 1942 up_write(&c->state_lock); 1943 bch2_free_super(&sb); 1944 return ret; 1945} 1946 1947int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) 1948{ 1949 down_write(&c->state_lock); 1950 1951 if (!bch2_dev_is_online(ca)) { 1952 bch_err(ca, "Already offline"); 1953 up_write(&c->state_lock); 1954 return 0; 1955 } 1956 1957 if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { 1958 bch_err(ca, "Cannot offline required disk"); 1959 up_write(&c->state_lock); 1960 return -BCH_ERR_device_state_not_allowed; 1961 } 1962 1963 __bch2_dev_offline(c, ca); 1964 1965 up_write(&c->state_lock); 1966 return 0; 1967} 1968 1969int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) 1970{ 1971 struct bch_member *m; 1972 u64 old_nbuckets; 1973 int ret = 0; 1974 1975 down_write(&c->state_lock); 1976 old_nbuckets = ca->mi.nbuckets; 1977 1978 if (nbuckets < ca->mi.nbuckets) { 1979 bch_err(ca, "Cannot shrink yet"); 1980 ret = -EINVAL; 1981 goto err; 1982 } 1983 1984 if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) { 1985 bch_err(ca, "New device size too big (%llu greater than max %u)", 1986 nbuckets, BCH_MEMBER_NBUCKETS_MAX); 1987 ret = -BCH_ERR_device_size_too_big; 1988 goto err; 1989 } 1990 1991 if (bch2_dev_is_online(ca) && 1992 get_capacity(ca->disk_sb.bdev->bd_disk) < 1993 ca->mi.bucket_size * nbuckets) { 1994 bch_err(ca, "New size larger than device"); 1995 ret = -BCH_ERR_device_size_too_small; 1996 goto err; 1997 } 1998 1999 ret = bch2_dev_buckets_resize(c, ca, nbuckets); 2000 bch_err_msg(ca, ret, "resizing buckets"); 2001 if (ret) 2002 goto err; 2003 2004 ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); 2005 if (ret) 2006 goto err; 2007 2008 mutex_lock(&c->sb_lock); 2009 m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); 2010 m->nbuckets = cpu_to_le64(nbuckets); 2011 2012 bch2_write_super(c); 2013 mutex_unlock(&c->sb_lock); 2014 2015 if (ca->mi.freespace_initialized) { 2016 u64 v[3] = { nbuckets - old_nbuckets, 0, 0 }; 2017 2018 ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0, 2019 bch2_disk_accounting_mod2(trans, false, v, dev_data_type, 2020 .dev = ca->dev_idx, 2021 .data_type = BCH_DATA_free)) ?: 2022 bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); 2023 if (ret) 2024 goto err; 2025 } 2026 2027 bch2_recalc_capacity(c); 2028err: 2029 up_write(&c->state_lock); 2030 return ret; 2031} 2032 2033/* return with ref on ca->ref: */ 2034struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) 2035{ 2036 if (!strncmp(name, "/dev/", strlen("/dev/"))) 2037 name += strlen("/dev/"); 2038 2039 for_each_member_device(c, ca) 2040 if (!strcmp(name, ca->name)) 2041 return ca; 2042 return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); 2043} 2044 2045/* blk_holder_ops: */ 2046 2047static struct bch_fs *bdev_get_fs(struct block_device *bdev) 2048 __releases(&bdev->bd_holder_lock) 2049{ 2050 struct bch_sb_handle_holder *holder = bdev->bd_holder; 2051 struct bch_fs *c = holder->c; 2052 2053 if (c && !bch2_ro_ref_tryget(c)) 2054 c = NULL; 2055 2056 mutex_unlock(&bdev->bd_holder_lock); 2057 2058 if (c) 2059 wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags)); 2060 return c; 2061} 2062 2063/* returns with ref on ca->ref */ 2064static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev) 2065{ 2066 for_each_member_device(c, ca) 2067 if (ca->disk_sb.bdev == bdev) 2068 return ca; 2069 return NULL; 2070} 2071 2072static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise) 2073{ 2074 struct bch_fs *c = bdev_get_fs(bdev); 2075 if (!c) 2076 return; 2077 2078 struct super_block *sb = c->vfs_sb; 2079 if (sb) { 2080 /* 2081 * Not necessary, c->ro_ref guards against the filesystem being 2082 * unmounted - we only take this to avoid a warning in 2083 * sync_filesystem: 2084 */ 2085 down_read(&sb->s_umount); 2086 } 2087 2088 down_write(&c->state_lock); 2089 struct bch_dev *ca = bdev_to_bch_dev(c, bdev); 2090 if (!ca) 2091 goto unlock; 2092 2093 if (bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, BCH_FORCE_IF_DEGRADED)) { 2094 __bch2_dev_offline(c, ca); 2095 } else { 2096 if (sb) { 2097 if (!surprise) 2098 sync_filesystem(sb); 2099 shrink_dcache_sb(sb); 2100 evict_inodes(sb); 2101 } 2102 2103 bch2_journal_flush(&c->journal); 2104 bch2_fs_emergency_read_only(c); 2105 } 2106 2107 bch2_dev_put(ca); 2108unlock: 2109 if (sb) 2110 up_read(&sb->s_umount); 2111 up_write(&c->state_lock); 2112 bch2_ro_ref_put(c); 2113} 2114 2115static void bch2_fs_bdev_sync(struct block_device *bdev) 2116{ 2117 struct bch_fs *c = bdev_get_fs(bdev); 2118 if (!c) 2119 return; 2120 2121 struct super_block *sb = c->vfs_sb; 2122 if (sb) { 2123 /* 2124 * Not necessary, c->ro_ref guards against the filesystem being 2125 * unmounted - we only take this to avoid a warning in 2126 * sync_filesystem: 2127 */ 2128 down_read(&sb->s_umount); 2129 sync_filesystem(sb); 2130 up_read(&sb->s_umount); 2131 } 2132 2133 bch2_ro_ref_put(c); 2134} 2135 2136const struct blk_holder_ops bch2_sb_handle_bdev_ops = { 2137 .mark_dead = bch2_fs_bdev_mark_dead, 2138 .sync = bch2_fs_bdev_sync, 2139}; 2140 2141/* Filesystem open: */ 2142 2143static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) 2144{ 2145 return cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?: 2146 cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time)); 2147} 2148 2149struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, 2150 struct bch_opts opts) 2151{ 2152 DARRAY(struct bch_sb_handle) sbs = { 0 }; 2153 struct bch_fs *c = NULL; 2154 struct bch_sb_handle *best = NULL; 2155 struct printbuf errbuf = PRINTBUF; 2156 int ret = 0; 2157 2158 if (!try_module_get(THIS_MODULE)) 2159 return ERR_PTR(-ENODEV); 2160 2161 if (!nr_devices) { 2162 ret = -EINVAL; 2163 goto err; 2164 } 2165 2166 ret = darray_make_room(&sbs, nr_devices); 2167 if (ret) 2168 goto err; 2169 2170 for (unsigned i = 0; i < nr_devices; i++) { 2171 struct bch_sb_handle sb = { NULL }; 2172 2173 ret = bch2_read_super(devices[i], &opts, &sb); 2174 if (ret) 2175 goto err; 2176 2177 BUG_ON(darray_push(&sbs, sb)); 2178 } 2179 2180 if (opts.nochanges && !opts.read_only) { 2181 ret = -BCH_ERR_erofs_nochanges; 2182 goto err_print; 2183 } 2184 2185 darray_for_each(sbs, sb) 2186 if (!best || sb_cmp(sb->sb, best->sb) > 0) 2187 best = sb; 2188 2189 darray_for_each_reverse(sbs, sb) { 2190 ret = bch2_dev_in_fs(best, sb, &opts); 2191 2192 if (ret == -BCH_ERR_device_has_been_removed || 2193 ret == -BCH_ERR_device_splitbrain) { 2194 bch2_free_super(sb); 2195 darray_remove_item(&sbs, sb); 2196 best -= best > sb; 2197 ret = 0; 2198 continue; 2199 } 2200 2201 if (ret) 2202 goto err_print; 2203 } 2204 2205 c = bch2_fs_alloc(best->sb, opts); 2206 ret = PTR_ERR_OR_ZERO(c); 2207 if (ret) 2208 goto err; 2209 2210 down_write(&c->state_lock); 2211 darray_for_each(sbs, sb) { 2212 ret = bch2_dev_attach_bdev(c, sb); 2213 if (ret) { 2214 up_write(&c->state_lock); 2215 goto err; 2216 } 2217 } 2218 up_write(&c->state_lock); 2219 2220 if (!c->opts.nostart) { 2221 ret = bch2_fs_start(c); 2222 if (ret) 2223 goto err; 2224 } 2225out: 2226 darray_for_each(sbs, sb) 2227 bch2_free_super(sb); 2228 darray_exit(&sbs); 2229 printbuf_exit(&errbuf); 2230 module_put(THIS_MODULE); 2231 return c; 2232err_print: 2233 pr_err("bch_fs_open err opening %s: %s", 2234 devices[0], bch2_err_str(ret)); 2235err: 2236 if (!IS_ERR_OR_NULL(c)) 2237 bch2_fs_stop(c); 2238 c = ERR_PTR(ret); 2239 goto out; 2240} 2241 2242/* Global interfaces/init */ 2243 2244static void bcachefs_exit(void) 2245{ 2246 bch2_debug_exit(); 2247 bch2_vfs_exit(); 2248 bch2_chardev_exit(); 2249 bch2_btree_key_cache_exit(); 2250 if (bcachefs_kset) 2251 kset_unregister(bcachefs_kset); 2252} 2253 2254static int __init bcachefs_init(void) 2255{ 2256 bch2_bkey_pack_test(); 2257 2258 if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || 2259 bch2_btree_key_cache_init() || 2260 bch2_chardev_init() || 2261 bch2_vfs_init() || 2262 bch2_debug_init()) 2263 goto err; 2264 2265 return 0; 2266err: 2267 bcachefs_exit(); 2268 return -ENOMEM; 2269} 2270 2271#define BCH_DEBUG_PARAM(name, description) \ 2272 bool bch2_##name; \ 2273 module_param_named(name, bch2_##name, bool, 0644); \ 2274 MODULE_PARM_DESC(name, description); 2275BCH_DEBUG_PARAMS() 2276#undef BCH_DEBUG_PARAM 2277 2278__maybe_unused 2279static unsigned bch2_metadata_version = bcachefs_metadata_version_current; 2280module_param_named(version, bch2_metadata_version, uint, 0444); 2281 2282module_exit(bcachefs_exit); 2283module_init(bcachefs_init);