Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bcachefs: Nocow support

This adds support for nocow mode, where we do writes in-place when
possible. Patch components:

- New boolean filesystem and inode option, nocow: note that when nocow
is enabled, data checksumming and compression are implicitly disabled

- To prevent in-place writes from racing with data moves
(data_update.c) or bucket reuse (i.e. a bucket being reused and
re-allocated while a nocow write is in flight), we have a new locking
mechanism.

Buckets can be locked for either data update or data move, using a
fixed size hash table of two_state_shared locks. We don't have any
chaining, meaning updates and moves to different buckets that hash to
the same lock will wait unnecessarily - we'll want to watch for this
becoming an issue.

- The allocator path also needs to check for in-place writes in flight
to a given bucket before giving it out: thus we add another counter
to bucket_alloc_state so we can track this.

- Fsync now may need to issue cache flushes to block devices instead of
flushing the journal. We add a device bitmask to bch_inode_info,
ei_devs_need_flush, which tracks devices that need to have flushes
issued - note that this will lead to unnecessary flushes when other
codepaths have already issued flushes; we may want to replace this with
a sequence number.

- New nocow write path: look up extents, and if they're writable write
to them - otherwise fall back to the normal COW write path.

XXX: switch to sequence numbers instead of bitmask for devs needing
journal flush

XXX: ei_quota_lock being a mutex means bch2_nocow_write_done() needs to
run in process context - see if we can improve this

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

+709 -52
+1
fs/bcachefs/Makefile
··· 52 52 migrate.o \ 53 53 move.o \ 54 54 movinggc.o \ 55 + nocow_locking.o \ 55 56 opts.o \ 56 57 printbuf.o \ 57 58 quota.o \
+5
fs/bcachefs/alloc_foreground.c
··· 227 227 return NULL; 228 228 } 229 229 230 + if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) { 231 + s->skipped_nocow++; 232 + return NULL; 233 + } 234 + 230 235 spin_lock(&c->freelist_lock); 231 236 232 237 if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) {
+1
fs/bcachefs/alloc_types.h
··· 12 12 u64 buckets_seen; 13 13 u64 skipped_open; 14 14 u64 skipped_need_journal_commit; 15 + u64 skipped_nocow; 15 16 u64 skipped_nouse; 16 17 }; 17 18
+8 -2
fs/bcachefs/bcachefs.h
··· 206 206 #include "bcachefs_format.h" 207 207 #include "errcode.h" 208 208 #include "fifo.h" 209 + #include "nocow_locking.h" 209 210 #include "opts.h" 210 211 #include "util.h" 211 212 ··· 384 383 x(journal_flush_seq) \ 385 384 x(blocked_journal) \ 386 385 x(blocked_allocate) \ 387 - x(blocked_allocate_open_bucket) 386 + x(blocked_allocate_open_bucket) \ 387 + x(nocow_lock_contended) 388 388 389 389 enum bch_time_stats { 390 390 #define x(name) BCH_TIME_##name, ··· 485 483 struct bch_sb *sb_read_scratch; 486 484 int sb_write_error; 487 485 dev_t dev; 486 + atomic_t flush_seq; 488 487 489 488 struct bch_devs_mask self; 490 489 ··· 900 897 struct bio_set bio_read_split; 901 898 struct bio_set bio_write; 902 899 struct mutex bio_bounce_pages_lock; 903 - mempool_t bio_bounce_pages; 900 + mempool_t bio_bounce_pages; 901 + struct bucket_nocow_lock_table 902 + nocow_locks; 904 903 struct rhashtable promote_table; 905 904 906 905 mempool_t compression_bounce[2]; ··· 964 959 struct bio_set writepage_bioset; 965 960 struct bio_set dio_write_bioset; 966 961 struct bio_set dio_read_bioset; 962 + struct bio_set nocow_flush_bioset; 967 963 968 964 /* ERRORS */ 969 965 struct list_head fsck_errors;
+7 -3
fs/bcachefs/bcachefs_format.h
··· 798 798 x(bi_dir, 64) \ 799 799 x(bi_dir_offset, 64) \ 800 800 x(bi_subvol, 32) \ 801 - x(bi_parent_subvol, 32) 801 + x(bi_parent_subvol, 32) \ 802 + x(bi_nocow, 8) 802 803 803 804 /* subset of BCH_INODE_FIELDS */ 804 805 #define BCH_INODE_OPTS() \ ··· 811 810 x(promote_target, 16) \ 812 811 x(foreground_target, 16) \ 813 812 x(background_target, 16) \ 814 - x(erasure_code, 16) 813 + x(erasure_code, 16) \ 814 + x(nocow, 8) 815 815 816 816 enum inode_opt_id { 817 817 #define x(name, ...) \ ··· 1550 1548 x(alloc_v4, 20) \ 1551 1549 x(new_data_types, 21) \ 1552 1550 x(backpointers, 22) \ 1553 - x(inode_v3, 23) 1551 + x(inode_v3, 23) \ 1552 + x(unwritten_extents, 24) 1554 1553 1555 1554 enum bcachefs_metadata_version { 1556 1555 bcachefs_metadata_version_min = 9, ··· 1699 1696 LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); 1700 1697 LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); 1701 1698 LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); 1699 + LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); 1702 1700 LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54); 1703 1701 1704 1702 /*
+2 -1
fs/bcachefs/btree_io.c
··· 1832 1832 bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr) 1833 1833 ptr->offset += wbio->sector_offset; 1834 1834 1835 - bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &tmp.k); 1835 + bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, 1836 + &tmp.k, false); 1836 1837 } 1837 1838 1838 1839 void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
+5 -2
fs/bcachefs/checksum.h
··· 99 99 } 100 100 101 101 static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, 102 - unsigned opt) 102 + struct bch_io_opts opts) 103 103 { 104 + if (opts.nocow) 105 + return 0; 106 + 104 107 if (c->sb.encryption_type) 105 108 return c->opts.wide_macs 106 109 ? BCH_CSUM_chacha20_poly1305_128 107 110 : BCH_CSUM_chacha20_poly1305_80; 108 111 109 - return bch2_csum_opt_to_type(opt, true); 112 + return bch2_csum_opt_to_type(opts.data_checksum, true); 110 113 } 111 114 112 115 static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
+10
fs/bcachefs/data_update.c
··· 303 303 void bch2_data_update_exit(struct data_update *update) 304 304 { 305 305 struct bch_fs *c = update->op.c; 306 + struct bkey_ptrs_c ptrs = 307 + bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k)); 308 + const struct bch_extent_ptr *ptr; 309 + 310 + bkey_for_each_ptr(ptrs, ptr) 311 + bch2_bucket_nocow_unlock(&c->nocow_locks, 312 + PTR_BUCKET_POS(c, ptr), 0); 306 313 307 314 bch2_bkey_buf_exit(&update->k, c); 308 315 bch2_disk_reservation_put(c, &update->op.res); ··· 458 451 m->op.incompressible = true; 459 452 460 453 i++; 454 + 455 + bch2_bucket_nocow_lock(&c->nocow_locks, 456 + PTR_BUCKET_POS(c, &p.ptr), 0); 461 457 } 462 458 463 459 if (reserve_sectors) {
+23 -16
fs/bcachefs/extents.c
··· 664 664 return replicas; 665 665 } 666 666 667 - static unsigned bch2_extent_ptr_durability(struct bch_fs *c, 668 - struct extent_ptr_decoded p) 667 + unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) 669 668 { 670 669 unsigned durability = 0; 671 670 struct bch_dev *ca; 672 671 673 - if (p.ptr.cached) 672 + if (p->ptr.cached) 674 673 return 0; 675 674 676 - ca = bch_dev_bkey_exists(c, p.ptr.dev); 675 + ca = bch_dev_bkey_exists(c, p->ptr.dev); 677 676 678 677 if (ca->mi.state != BCH_MEMBER_STATE_failed) 679 678 durability = max_t(unsigned, durability, ca->mi.durability); 680 679 681 - if (p.has_ec) 682 - durability += p.ec.redundancy; 680 + if (p->has_ec) 681 + durability += p->ec.redundancy; 683 682 684 683 return durability; 685 684 } ··· 691 692 unsigned durability = 0; 692 693 693 694 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) 694 - durability += bch2_extent_ptr_durability(c, p); 695 + durability += bch2_extent_ptr_durability(c,& p); 695 696 696 697 return durability; 697 698 } ··· 906 907 */ 907 908 bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) 908 909 { 909 - struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1); 910 - struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); 911 - const union bch_extent_entry *entry1, *entry2; 912 - struct extent_ptr_decoded p1, p2; 913 - 914 - if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2)) 910 + if (k1.k->type != k2.k->type) 915 911 return false; 916 912 917 - bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) 918 - bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) 913 + if (bkey_extent_is_direct_data(k1.k)) { 914 + struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1); 915 + struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); 916 + const union bch_extent_entry *entry1, *entry2; 917 + struct extent_ptr_decoded p1, p2; 918 + 919 + if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2)) 920 + return false; 921 + 922 + bkey_for_each_ptr_decode(k1.k, ptrs1, p1, 
entry1) 923 + bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) 919 924 if (p1.ptr.dev == p2.ptr.dev && 920 925 p1.ptr.gen == p2.ptr.gen && 921 926 (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == 922 927 (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) 923 928 return true; 924 929 925 - return false; 930 + return false; 931 + } else { 932 + /* KEY_TYPE_deleted, etc. */ 933 + return true; 934 + } 926 935 } 927 936 928 937 bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1,
+1
fs/bcachefs/extents.h
··· 596 596 unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); 597 597 598 598 unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); 599 + unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *); 599 600 unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); 600 601 601 602 void bch2_bkey_drop_device(struct bkey_s, unsigned);
+89 -9
fs/bcachefs/fs-io.c
··· 35 35 36 36 #include <trace/events/writeback.h> 37 37 38 + struct nocow_flush { 39 + struct closure *cl; 40 + struct bch_dev *ca; 41 + struct bio bio; 42 + }; 43 + 44 + static void nocow_flush_endio(struct bio *_bio) 45 + { 46 + 47 + struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); 48 + 49 + closure_put(bio->cl); 50 + percpu_ref_put(&bio->ca->io_ref); 51 + bio_put(&bio->bio); 52 + } 53 + 54 + static void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, 55 + struct bch_inode_info *inode, 56 + struct closure *cl) 57 + { 58 + struct nocow_flush *bio; 59 + struct bch_dev *ca; 60 + struct bch_devs_mask devs; 61 + unsigned dev; 62 + 63 + dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX); 64 + if (dev == BCH_SB_MEMBERS_MAX) 65 + return; 66 + 67 + devs = inode->ei_devs_need_flush; 68 + memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); 69 + 70 + for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { 71 + rcu_read_lock(); 72 + ca = rcu_dereference(c->devs[dev]); 73 + if (ca && !percpu_ref_tryget(&ca->io_ref)) 74 + ca = NULL; 75 + rcu_read_unlock(); 76 + 77 + if (!ca) 78 + continue; 79 + 80 + bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, 81 + REQ_OP_FLUSH, 82 + GFP_KERNEL, 83 + &c->nocow_flush_bioset), 84 + struct nocow_flush, bio); 85 + bio->cl = cl; 86 + bio->ca = ca; 87 + bio->bio.bi_end_io = nocow_flush_endio; 88 + closure_bio_submit(&bio->bio, cl); 89 + } 90 + } 91 + 92 + static int bch2_inode_flush_nocow_writes(struct bch_fs *c, 93 + struct bch_inode_info *inode) 94 + { 95 + struct closure cl; 96 + 97 + closure_init_stack(&cl); 98 + bch2_inode_flush_nocow_writes_async(c, inode, &cl); 99 + closure_sync(&cl); 100 + 101 + return 0; 102 + } 103 + 38 104 static inline bool bio_full(struct bio *bio, unsigned len) 39 105 { 40 106 if (bio->bi_vcnt >= bio->bi_max_vecs) ··· 1393 1327 op->subvol = inode->ei_subvol; 1394 1328 op->pos = POS(inode->v.i_ino, sector); 1395 1329 op->end_io = 
bch2_writepage_io_done; 1330 + op->devs_need_flush = &inode->ei_devs_need_flush; 1396 1331 op->wbio.bio.bi_iter.bi_sector = sector; 1397 1332 op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); 1398 1333 } ··· 2215 2148 2216 2149 if (!dio->op.error) { 2217 2150 ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode); 2218 - if (ret) 2151 + if (ret) { 2219 2152 dio->op.error = ret; 2220 - else 2153 + } else { 2221 2154 bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl); 2155 + bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl); 2156 + } 2222 2157 } 2223 2158 2224 2159 if (dio->sync) { ··· 2365 2296 dio->op.nr_replicas = dio->op.opts.data_replicas; 2366 2297 dio->op.subvol = inode->ei_subvol; 2367 2298 dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); 2299 + dio->op.devs_need_flush = &inode->ei_devs_need_flush; 2368 2300 2369 2301 if (sync) 2370 2302 dio->op.flags |= BCH_WRITE_SYNC; ··· 2565 2495 * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an 2566 2496 * insert trigger: look up the btree inode instead 2567 2497 */ 2568 - static int bch2_flush_inode(struct bch_fs *c, subvol_inum inum) 2498 + static int bch2_flush_inode(struct bch_fs *c, 2499 + struct bch_inode_info *inode) 2569 2500 { 2570 - struct bch_inode_unpacked inode; 2501 + struct bch_inode_unpacked u; 2571 2502 int ret; 2572 2503 2573 2504 if (c->opts.journal_flush_disabled) 2574 2505 return 0; 2575 2506 2576 - ret = bch2_inode_find_by_inum(c, inum, &inode); 2507 + ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u); 2577 2508 if (ret) 2578 2509 return ret; 2579 2510 2580 - return bch2_journal_flush_seq(&c->journal, inode.bi_journal_seq); 2511 + return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?: 2512 + bch2_inode_flush_nocow_writes(c, inode); 2581 2513 } 2582 2514 2583 2515 int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) ··· 2590 2518 2591 2519 ret = file_write_and_wait_range(file, start, 
end); 2592 2520 ret2 = sync_inode_metadata(&inode->v, 1); 2593 - ret3 = bch2_flush_inode(c, inode_inum(inode)); 2521 + ret3 = bch2_flush_inode(c, inode); 2594 2522 2595 2523 return bch2_err_class(ret ?: ret2 ?: ret3); 2596 2524 } ··· 3177 3105 continue; 3178 3106 } 3179 3107 3108 + /* 3109 + * XXX: for nocow mode, we should promote shared extents to 3110 + * unshared here 3111 + */ 3112 + 3180 3113 sectors = bpos_min(k.k->p, end_pos).offset - iter.pos.offset; 3181 3114 3182 3115 if (!bkey_extent_is_allocation(k.k)) { ··· 3445 3368 3446 3369 if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || 3447 3370 IS_SYNC(file_inode(file_dst))) 3448 - ret = bch2_flush_inode(c, inode_inum(dst)); 3371 + ret = bch2_flush_inode(c, dst); 3449 3372 err: 3450 3373 bch2_quota_reservation_put(c, dst, &quota_res); 3451 3374 bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); ··· 3699 3622 3700 3623 void bch2_fs_fsio_exit(struct bch_fs *c) 3701 3624 { 3625 + bioset_exit(&c->nocow_flush_bioset); 3702 3626 bioset_exit(&c->dio_write_bioset); 3703 3627 bioset_exit(&c->dio_read_bioset); 3704 3628 bioset_exit(&c->writepage_bioset); ··· 3719 3641 BIOSET_NEED_BVECS) || 3720 3642 bioset_init(&c->dio_write_bioset, 3721 3643 4, offsetof(struct dio_write, op.wbio.bio), 3722 - BIOSET_NEED_BVECS)) 3644 + BIOSET_NEED_BVECS) || 3645 + bioset_init(&c->nocow_flush_bioset, 3646 + 1, offsetof(struct nocow_flush, bio), 0)) 3723 3647 ret = -ENOMEM; 3724 3648 3725 3649 pr_verbose_init(c->opts, "ret %i", ret);
+11
fs/bcachefs/fs.h
··· 25 25 26 26 u32 ei_subvol; 27 27 28 + /* 29 + * When we've been doing nocow writes we'll need to issue flushes to the 30 + * underlying block devices 31 + * 32 + * XXX: a device may have had a flush issued by some other codepath. It 33 + * would be better to keep for each device a sequence number that's 34 + * incremented when we isusue a cache flush, and track here the sequence 35 + * number that needs flushing. 36 + */ 37 + struct bch_devs_mask ei_devs_need_flush; 38 + 28 39 /* copy of inode in btree: */ 29 40 struct bch_inode_unpacked ei_inode; 30 41 };
+3
fs/bcachefs/inode.c
··· 892 892 #define x(_name, _bits) opts->_name = inode_opt_get(c, inode, _name); 893 893 BCH_INODE_OPTS() 894 894 #undef x 895 + 896 + if (opts->nocow) 897 + opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0; 895 898 }
+439 -13
fs/bcachefs/io.c
··· 34 34 #include "trace.h" 35 35 36 36 #include <linux/blkdev.h> 37 + #include <linux/prefetch.h> 37 38 #include <linux/random.h> 38 39 #include <linux/sched/mm.h> 39 40 ··· 376 375 s64 *i_sectors_delta, 377 376 struct write_point_specifier write_point) 378 377 { 379 - int ret; 380 378 struct bch_fs *c = trans->c; 381 379 struct disk_reservation disk_res = { 0 }; 382 - struct bkey_i_reservation *reservation = 383 - bch2_trans_kmalloc(trans, sizeof(*reservation)); 380 + struct closure cl; 381 + struct open_buckets open_buckets; 382 + struct bkey_s_c k; 383 + struct bkey_buf old, new; 384 + bool have_reservation = false; 385 + bool unwritten = opts.nocow && 386 + c->sb.version >= bcachefs_metadata_version_unwritten_extents; 387 + int ret; 384 388 385 - ret = PTR_ERR_OR_ZERO(reservation); 389 + bch2_bkey_buf_init(&old); 390 + bch2_bkey_buf_init(&new); 391 + closure_init_stack(&cl); 392 + open_buckets.nr = 0; 393 + retry: 394 + k = bch2_btree_iter_peek_slot(iter); 395 + ret = bkey_err(k); 386 396 if (ret) 387 397 return ret; 388 398 389 - bkey_reservation_init(&reservation->k_i); 390 - reservation->k.p = iter->pos; 391 - bch2_key_resize(&reservation->k, sectors); 392 - reservation->v.nr_replicas = opts.data_replicas; 399 + sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset); 393 400 394 - ret = bch2_extent_update(trans, inum, iter, &reservation->k_i, &disk_res, 401 + if (!have_reservation) { 402 + unsigned new_replicas = 403 + max(0, (int) opts.data_replicas - 404 + (int) bch2_bkey_nr_ptrs_fully_allocated(k)); 405 + /* 406 + * Get a disk reservation before (in the nocow case) calling 407 + * into the allocator: 408 + */ 409 + ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); 410 + if (unlikely(ret)) 411 + goto out; 412 + 413 + bch2_bkey_buf_reassemble(&old, c, k); 414 + } 415 + 416 + if (have_reservation) { 417 + if (!bch2_extents_match(k, bkey_i_to_s_c(old.k))) 418 + goto out; 419 + 420 + bch2_key_resize(&new.k->k, sectors); 421 + 
} else if (!unwritten) { 422 + struct bkey_i_reservation *reservation; 423 + 424 + bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64)); 425 + reservation = bkey_reservation_init(new.k); 426 + reservation->k.p = iter->pos; 427 + bch2_key_resize(&reservation->k, sectors); 428 + reservation->v.nr_replicas = opts.data_replicas; 429 + } else { 430 + struct bkey_i_extent *e; 431 + struct bch_devs_list devs_have; 432 + struct write_point *wp; 433 + struct bch_extent_ptr *ptr; 434 + 435 + devs_have.nr = 0; 436 + 437 + bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX); 438 + 439 + e = bkey_extent_init(new.k); 440 + e->k.p = iter->pos; 441 + 442 + ret = bch2_alloc_sectors_start_trans(trans, 443 + opts.foreground_target, 444 + false, 445 + write_point, 446 + &devs_have, 447 + opts.data_replicas, 448 + opts.data_replicas, 449 + RESERVE_none, 0, &cl, &wp); 450 + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) { 451 + bch2_trans_unlock(trans); 452 + closure_sync(&cl); 453 + goto retry; 454 + } 455 + if (ret) 456 + return ret; 457 + 458 + sectors = min(sectors, wp->sectors_free); 459 + 460 + bch2_key_resize(&e->k, sectors); 461 + 462 + bch2_open_bucket_get(c, wp, &open_buckets); 463 + bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); 464 + bch2_alloc_sectors_done(c, wp); 465 + 466 + extent_for_each_ptr(extent_i_to_s(e), ptr) 467 + ptr->unwritten = true; 468 + } 469 + 470 + have_reservation = true; 471 + 472 + ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, 395 473 0, i_sectors_delta, true); 474 + out: 475 + if ((atomic_read(&cl.remaining) & CLOSURE_REMAINING_MASK) != 1) { 476 + bch2_trans_unlock(trans); 477 + closure_sync(&cl); 478 + } 479 + 480 + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { 481 + bch2_trans_begin(trans); 482 + goto retry; 483 + } 484 + 485 + bch2_open_buckets_put(c, &open_buckets); 396 486 bch2_disk_reservation_put(c, &disk_res); 487 + bch2_bkey_buf_exit(&new, c); 488 + bch2_bkey_buf_exit(&old, c); 
489 + 397 490 return ret; 398 491 } 399 492 ··· 634 539 635 540 void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, 636 541 enum bch_data_type type, 637 - const struct bkey_i *k) 542 + const struct bkey_i *k, 543 + bool nocow) 638 544 { 639 545 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); 640 546 const struct bch_extent_ptr *ptr; ··· 669 573 670 574 n->c = c; 671 575 n->dev = ptr->dev; 672 - n->have_ioref = bch2_dev_get_ioref(ca, 576 + n->have_ioref = nocow || bch2_dev_get_ioref(ca, 673 577 type == BCH_DATA_btree ? READ : WRITE); 578 + n->nocow = nocow; 674 579 n->submit_time = local_clock(); 675 580 n->inode_offset = bkey_start_offset(&k->k); 676 581 n->bio.bi_iter.bi_sector = ptr->offset; ··· 897 800 set_bit(wbio->dev, op->failed.d); 898 801 op->flags |= BCH_WRITE_IO_ERROR; 899 802 } 803 + 804 + if (wbio->nocow) 805 + set_bit(wbio->dev, op->devs_need_flush->d); 900 806 901 807 if (wbio->have_ioref) { 902 808 bch2_latency_acct(ca, wbio->submit_time, WRITE); ··· 1321 1221 return ret; 1322 1222 } 1323 1223 1224 + static bool bch2_extent_is_writeable(struct bch_write_op *op, 1225 + struct bkey_s_c k) 1226 + { 1227 + struct bch_fs *c = op->c; 1228 + struct bkey_s_c_extent e; 1229 + struct extent_ptr_decoded p; 1230 + const union bch_extent_entry *entry; 1231 + unsigned replicas = 0; 1232 + 1233 + if (k.k->type != KEY_TYPE_extent) 1234 + return false; 1235 + 1236 + e = bkey_s_c_to_extent(k); 1237 + extent_for_each_ptr_decode(e, p, entry) { 1238 + if (p.crc.csum_type || 1239 + crc_is_compressed(p.crc) || 1240 + p.has_ec) 1241 + return false; 1242 + 1243 + replicas += bch2_extent_ptr_durability(c, &p); 1244 + } 1245 + 1246 + return replicas >= op->opts.data_replicas; 1247 + } 1248 + 1249 + static inline void bch2_nocow_write_unlock(struct bch_write_op *op) 1250 + { 1251 + struct bch_fs *c = op->c; 1252 + const struct bch_extent_ptr *ptr; 1253 + struct bkey_i *k; 1254 + 1255 + for_each_keylist_key(&op->insert_keys, k) { 1256 + 
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); 1257 + 1258 + bkey_for_each_ptr(ptrs, ptr) 1259 + bch2_bucket_nocow_unlock(&c->nocow_locks, 1260 + PTR_BUCKET_POS(c, ptr), 1261 + BUCKET_NOCOW_LOCK_UPDATE); 1262 + } 1263 + } 1264 + 1265 + static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, 1266 + struct btree_iter *iter, 1267 + struct bkey_i *orig, 1268 + struct bkey_s_c k, 1269 + u64 new_i_size) 1270 + { 1271 + struct bkey_i *new; 1272 + struct bkey_ptrs ptrs; 1273 + struct bch_extent_ptr *ptr; 1274 + int ret; 1275 + 1276 + if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) { 1277 + /* trace this */ 1278 + return 0; 1279 + } 1280 + 1281 + new = bch2_bkey_make_mut(trans, k); 1282 + ret = PTR_ERR_OR_ZERO(new); 1283 + if (ret) 1284 + return ret; 1285 + 1286 + bch2_cut_front(bkey_start_pos(&orig->k), new); 1287 + bch2_cut_back(orig->k.p, new); 1288 + 1289 + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); 1290 + bkey_for_each_ptr(ptrs, ptr) 1291 + ptr->unwritten = 0; 1292 + 1293 + /* 1294 + * Note that we're not calling bch2_subvol_get_snapshot() in this path - 1295 + * that was done when we kicked off the write, and here it's important 1296 + * that we update the extent that we wrote to - even if a snapshot has 1297 + * since been created. The write is still outstanding, so we're ok 1298 + * w.r.t. 
snapshot atomicity: 1299 + */ 1300 + return bch2_extent_update_i_size_sectors(trans, iter, 1301 + min(new->k.p.offset << 9, new_i_size), 0) ?: 1302 + bch2_trans_update(trans, iter, new, 1303 + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); 1304 + } 1305 + 1306 + static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) 1307 + { 1308 + struct bch_fs *c = op->c; 1309 + struct btree_trans trans; 1310 + struct btree_iter iter; 1311 + struct bkey_i *orig; 1312 + struct bkey_s_c k; 1313 + int ret; 1314 + 1315 + bch2_trans_init(&trans, c, 0, 0); 1316 + 1317 + for_each_keylist_key(&op->insert_keys, orig) { 1318 + ret = for_each_btree_key_upto_commit(&trans, iter, BTREE_ID_extents, 1319 + bkey_start_pos(&orig->k), orig->k.p, 1320 + BTREE_ITER_INTENT, k, 1321 + NULL, NULL, BTREE_INSERT_NOFAIL, ({ 1322 + bch2_nocow_write_convert_one_unwritten(&trans, &iter, orig, k, op->new_i_size); 1323 + })); 1324 + 1325 + if (ret && !bch2_err_matches(ret, EROFS)) { 1326 + struct bkey_i *k = bch2_keylist_front(&op->insert_keys); 1327 + 1328 + bch_err_inum_offset_ratelimited(c, 1329 + k->k.p.inode, k->k.p.offset << 9, 1330 + "write error while doing btree update: %s", 1331 + bch2_err_str(ret)); 1332 + } 1333 + 1334 + if (ret) { 1335 + op->error = ret; 1336 + break; 1337 + } 1338 + } 1339 + 1340 + bch2_trans_exit(&trans); 1341 + } 1342 + 1343 + static void __bch2_nocow_write_done(struct bch_write_op *op) 1344 + { 1345 + bch2_nocow_write_unlock(op); 1346 + 1347 + if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { 1348 + op->error = -EIO; 1349 + } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) 1350 + bch2_nocow_write_convert_unwritten(op); 1351 + } 1352 + 1353 + static void bch2_nocow_write_done(struct closure *cl) 1354 + { 1355 + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); 1356 + 1357 + __bch2_nocow_write_done(op); 1358 + bch2_write_done(cl); 1359 + } 1360 + 1361 + static void bch2_nocow_write(struct bch_write_op *op) 1362 + { 1363 + struct bch_fs *c = 
op->c; 1364 + struct btree_trans trans; 1365 + struct btree_iter iter; 1366 + struct bkey_s_c k; 1367 + struct bkey_ptrs_c ptrs; 1368 + const struct bch_extent_ptr *ptr, *ptr2; 1369 + struct { 1370 + struct bpos b; 1371 + unsigned gen; 1372 + two_state_lock_t *l; 1373 + } buckets[BCH_REPLICAS_MAX]; 1374 + unsigned nr_buckets = 0; 1375 + u32 snapshot; 1376 + int ret, i; 1377 + 1378 + if (op->flags & BCH_WRITE_MOVE) 1379 + return; 1380 + 1381 + bch2_trans_init(&trans, c, 0, 0); 1382 + retry: 1383 + bch2_trans_begin(&trans); 1384 + 1385 + ret = bch2_subvolume_get_snapshot(&trans, op->subvol, &snapshot); 1386 + if (unlikely(ret)) 1387 + goto err; 1388 + 1389 + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, 1390 + SPOS(op->pos.inode, op->pos.offset, snapshot), 1391 + BTREE_ITER_SLOTS); 1392 + while (1) { 1393 + struct bio *bio = &op->wbio.bio; 1394 + 1395 + nr_buckets = 0; 1396 + 1397 + k = bch2_btree_iter_peek_slot(&iter); 1398 + ret = bkey_err(k); 1399 + if (ret) 1400 + break; 1401 + 1402 + /* fall back to normal cow write path? 
*/ 1403 + if (unlikely(k.k->p.snapshot != snapshot || 1404 + !bch2_extent_is_writeable(op, k))) 1405 + break; 1406 + 1407 + if (bch2_keylist_realloc(&op->insert_keys, 1408 + op->inline_keys, 1409 + ARRAY_SIZE(op->inline_keys), 1410 + k.k->u64s)) 1411 + break; 1412 + 1413 + /* Get iorefs before dropping btree locks: */ 1414 + ptrs = bch2_bkey_ptrs_c(k); 1415 + bkey_for_each_ptr(ptrs, ptr) { 1416 + buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr); 1417 + buckets[nr_buckets].gen = ptr->gen; 1418 + buckets[nr_buckets].l = 1419 + bucket_nocow_lock(&c->nocow_locks, buckets[nr_buckets].b); 1420 + 1421 + prefetch(buckets[nr_buckets].l); 1422 + nr_buckets++; 1423 + 1424 + if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE))) 1425 + goto err_get_ioref; 1426 + 1427 + if (ptr->unwritten) 1428 + op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; 1429 + } 1430 + 1431 + /* Unlock before taking nocow locks, doing IO: */ 1432 + bkey_reassemble(op->insert_keys.top, k); 1433 + bch2_trans_unlock(&trans); 1434 + 1435 + bch2_cut_front(op->pos, op->insert_keys.top); 1436 + if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) 1437 + bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); 1438 + 1439 + for (i = 0; i < nr_buckets; i++) { 1440 + struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode); 1441 + two_state_lock_t *l = buckets[i].l; 1442 + bool stale; 1443 + 1444 + if (!bch2_two_state_trylock(l, BUCKET_NOCOW_LOCK_UPDATE)) 1445 + __bch2_bucket_nocow_lock(&c->nocow_locks, l, BUCKET_NOCOW_LOCK_UPDATE); 1446 + 1447 + rcu_read_lock(); 1448 + stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen); 1449 + rcu_read_unlock(); 1450 + 1451 + if (unlikely(stale)) 1452 + goto err_bucket_stale; 1453 + } 1454 + 1455 + bio = &op->wbio.bio; 1456 + if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) { 1457 + bio = bio_split(bio, k.k->p.offset - op->pos.offset, 1458 + GFP_KERNEL, &c->bio_write); 1459 + wbio_init(bio)->put_bio = true; 
1460 + bio->bi_opf = op->wbio.bio.bi_opf; 1461 + } else { 1462 + op->flags |= BCH_WRITE_DONE; 1463 + } 1464 + 1465 + op->pos.offset += bio_sectors(bio); 1466 + op->written += bio_sectors(bio); 1467 + 1468 + bio->bi_end_io = bch2_write_endio; 1469 + bio->bi_private = &op->cl; 1470 + bio->bi_opf |= REQ_OP_WRITE; 1471 + closure_get(&op->cl); 1472 + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, 1473 + op->insert_keys.top, true); 1474 + 1475 + bch2_keylist_push(&op->insert_keys); 1476 + if (op->flags & BCH_WRITE_DONE) 1477 + break; 1478 + bch2_btree_iter_advance(&iter); 1479 + } 1480 + out: 1481 + bch2_trans_iter_exit(&trans, &iter); 1482 + err: 1483 + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 1484 + goto retry; 1485 + 1486 + if (ret) { 1487 + bch_err_inum_offset_ratelimited(c, 1488 + op->pos.inode, 1489 + op->pos.offset << 9, 1490 + "%s: btree lookup error %s", 1491 + __func__, bch2_err_str(ret)); 1492 + op->error = ret; 1493 + op->flags |= BCH_WRITE_DONE; 1494 + } 1495 + 1496 + bch2_trans_exit(&trans); 1497 + 1498 + /* fallback to cow write path? 
*/ 1499 + if (!(op->flags & BCH_WRITE_DONE)) { 1500 + closure_sync(&op->cl); 1501 + __bch2_nocow_write_done(op); 1502 + op->insert_keys.top = op->insert_keys.keys; 1503 + } else if (op->flags & BCH_WRITE_SYNC) { 1504 + closure_sync(&op->cl); 1505 + bch2_nocow_write_done(&op->cl); 1506 + } else { 1507 + /* 1508 + * XXX 1509 + * needs to run out of process context because ei_quota_lock is 1510 + * a mutex 1511 + */ 1512 + continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op)); 1513 + } 1514 + return; 1515 + err_get_ioref: 1516 + bkey_for_each_ptr(ptrs, ptr2) { 1517 + if (ptr2 == ptr) 1518 + break; 1519 + 1520 + percpu_ref_put(&bch_dev_bkey_exists(c, ptr2->dev)->io_ref); 1521 + } 1522 + 1523 + /* Fall back to COW path: */ 1524 + goto out; 1525 + err_bucket_stale: 1526 + while (--i >= 0) 1527 + bch2_bucket_nocow_unlock(&c->nocow_locks, 1528 + buckets[i].b, 1529 + BUCKET_NOCOW_LOCK_UPDATE); 1530 + 1531 + bkey_for_each_ptr(ptrs, ptr2) 1532 + percpu_ref_put(&bch_dev_bkey_exists(c, ptr2->dev)->io_ref); 1533 + 1534 + /* We can retry this: */ 1535 + ret = BCH_ERR_transaction_restart; 1536 + goto out; 1537 + } 1538 + 1324 1539 static void __bch2_write(struct bch_write_op *op) 1325 1540 { 1326 1541 struct bch_fs *c = op->c; ··· 1645 1230 int ret; 1646 1231 1647 1232 nofs_flags = memalloc_nofs_save(); 1233 + 1234 + if (unlikely(op->opts.nocow)) { 1235 + bch2_nocow_write(op); 1236 + if (op->flags & BCH_WRITE_DONE) 1237 + goto out_nofs_restore; 1238 + } 1648 1239 again: 1649 1240 memset(&op->failed, 0, sizeof(op->failed)); 1650 1241 op->btree_update_ready = false; ··· 1731 1310 key_to_write_offset); 1732 1311 1733 1312 bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, 1734 - key_to_write); 1313 + key_to_write, false); 1735 1314 } while (ret); 1736 1315 1737 1316 /* ··· 1753 1332 } else { 1754 1333 continue_at(&op->cl, bch2_write_index, NULL); 1755 1334 } 1756 - 1335 + out_nofs_restore: 1757 1336 memalloc_nofs_restore(nofs_flags); 1758 1337 } 1759 1338 ··· 2984 
2563 2985 2564 int bch2_fs_io_init(struct bch_fs *c) 2986 2565 { 2566 + unsigned i; 2567 + 2568 + for (i = 0; i < ARRAY_SIZE(c->nocow_locks.l); i++) 2569 + two_state_lock_init(&c->nocow_locks.l[i]); 2570 + 2987 2571 if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), 2988 2572 BIOSET_NEED_BVECS) || 2989 2573 bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
+5 -2
fs/bcachefs/io.h
··· 22 22 #endif 23 23 24 24 void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, 25 - enum bch_data_type, const struct bkey_i *); 25 + enum bch_data_type, const struct bkey_i *, bool); 26 26 27 27 #define BLK_STS_REMOVED ((__force blk_status_t)128) 28 28 ··· 43 43 __BCH_WRITE_IN_WORKER, 44 44 __BCH_WRITE_DONE, 45 45 __BCH_WRITE_IO_ERROR, 46 + __BCH_WRITE_CONVERT_UNWRITTEN, 46 47 }; 47 48 48 49 #define BCH_WRITE_ALLOC_NOWAIT (1U << __BCH_WRITE_ALLOC_NOWAIT) ··· 62 61 #define BCH_WRITE_IN_WORKER (1U << __BCH_WRITE_IN_WORKER) 63 62 #define BCH_WRITE_DONE (1U << __BCH_WRITE_DONE) 64 63 #define BCH_WRITE_IO_ERROR (1U << __BCH_WRITE_IO_ERROR) 64 + #define BCH_WRITE_CONVERT_UNWRITTEN (1U << __BCH_WRITE_CONVERT_UNWRITTEN) 65 65 66 66 static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) 67 67 { ··· 92 90 op->flags = 0; 93 91 op->written = 0; 94 92 op->error = 0; 95 - op->csum_type = bch2_data_checksum_type(c, opts.data_checksum); 93 + op->csum_type = bch2_data_checksum_type(c, opts); 96 94 op->compression_type = bch2_compression_opt_to_type[opts.compression]; 97 95 op->nr_replicas = 0; 98 96 op->nr_replicas_required = c->opts.data_replicas_required; ··· 109 107 op->res = (struct disk_reservation) { 0 }; 110 108 op->new_i_size = U64_MAX; 111 109 op->i_sectors_delta = 0; 110 + op->devs_need_flush = NULL; 112 111 } 113 112 114 113 void bch2_write(struct closure *);
+7
fs/bcachefs/io_types.h
··· 97 97 bounce:1, 98 98 put_bio:1, 99 99 have_ioref:1, 100 + nocow:1, 100 101 used_mempool:1, 101 102 first_btree_write:1; 102 103 ); ··· 151 150 152 151 struct keylist insert_keys; 153 152 u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; 153 + 154 + /* 155 + * Bitmask of devices that have had nocow writes issued to them since 156 + * last flush: 157 + */ 158 + struct bch_devs_mask *devs_need_flush; 154 159 155 160 /* Must be last: */ 156 161 struct bch_write_bio wbio;
+7
fs/bcachefs/move.c
··· 260 260 if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_move)) 261 261 return -BCH_ERR_erofs_no_writes; 262 262 263 + /* 264 + * Before memory allocations & taking nocow locks in 265 + * bch2_data_update_init(): 266 + */ 267 + bch2_trans_unlock(trans); 268 + 263 269 /* write path might have to decompress data: */ 264 270 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) 265 271 sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); ··· 512 506 */ 513 507 bch2_bkey_buf_reassemble(&sk, c, k); 514 508 k = bkey_i_to_s_c(sk.k); 509 + bch2_trans_unlock(&trans); 515 510 516 511 ret2 = bch2_move_extent(&trans, &iter, ctxt, io_opts, 517 512 btree_id, k, data_opts);
+15
fs/bcachefs/nocow_locking.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include "bcachefs.h" 4 + #include "nocow_locking.h" 5 + #include "util.h" 6 + 7 + void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, 8 + two_state_lock_t *l, int flags) 9 + { 10 + struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks); 11 + u64 start_time = local_clock(); 12 + 13 + bch2_two_state_lock(l, flags & BUCKET_NOCOW_LOCK_UPDATE); 14 + bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time); 15 + }
+55
fs/bcachefs/nocow_locking.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _BCACHEFS_NOCOW_LOCKING_H 3 + #define _BCACHEFS_NOCOW_LOCKING_H 4 + 5 + #include "bcachefs_format.h" 6 + #include "two_state_shared_lock.h" 7 + 8 + #include <linux/hash.h> 9 + 10 + #define BUCKET_NOCOW_LOCKS_BITS 10 11 + #define BUCKET_NOCOW_LOCKS (1U << BUCKET_NOCOW_LOCKS_BITS) 12 + 13 + struct bucket_nocow_lock_table { 14 + two_state_lock_t l[BUCKET_NOCOW_LOCKS]; 15 + }; 16 + 17 + #define BUCKET_NOCOW_LOCK_UPDATE (1 << 0) 18 + 19 + static inline two_state_lock_t *bucket_nocow_lock(struct bucket_nocow_lock_table *t, 20 + struct bpos bucket) 21 + { 22 + u64 dev_bucket = bucket.inode << 56 | bucket.offset; 23 + unsigned h = hash_64(dev_bucket, BUCKET_NOCOW_LOCKS_BITS); 24 + 25 + return t->l + (h & (BUCKET_NOCOW_LOCKS - 1)); 26 + } 27 + 28 + static inline bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, 29 + struct bpos bucket) 30 + { 31 + two_state_lock_t *l = bucket_nocow_lock(t, bucket); 32 + 33 + return atomic_long_read(&l->v) != 0; 34 + } 35 + 36 + static inline void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, 37 + struct bpos bucket, int flags) 38 + { 39 + two_state_lock_t *l = bucket_nocow_lock(t, bucket); 40 + 41 + bch2_two_state_unlock(l, flags & BUCKET_NOCOW_LOCK_UPDATE); 42 + } 43 + 44 + void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *, two_state_lock_t *, int); 45 + 46 + static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, 47 + struct bpos bucket, int flags) 48 + { 49 + two_state_lock_t *l = bucket_nocow_lock(t, bucket); 50 + 51 + if (!bch2_two_state_trylock(l, flags & BUCKET_NOCOW_LOCK_UPDATE)) 52 + __bch2_bucket_nocow_lock(t, l, flags); 53 + } 54 + 55 + #endif /* _BCACHEFS_NOCOW_LOCKING_H */
+7
fs/bcachefs/opts.h
··· 392 392 OPT_BOOL(), \ 393 393 BCH2_NO_SB_OPT, false, \ 394 394 NULL, NULL) \ 395 + x(nocow, u8, \ 396 + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ 397 + OPT_BOOL(), \ 398 + BCH_SB_NOCOW, false, \ 399 + NULL, "Nocow mode: Writes will be done in place when possible.\n"\ 400 + "Snapshots and reflink will still cause writes to be COW\n"\ 401 + "Implicitly disables data checksumming, compression and encryption")\ 395 402 x(no_data_io, u8, \ 396 403 OPT_MOUNT, \ 397 404 OPT_BOOL(), \
+4 -3
fs/bcachefs/super.h
··· 88 88 static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, 89 89 unsigned dev) 90 90 { 91 - BUG_ON(bch2_dev_list_has_dev(*devs, dev)); 92 - BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); 93 - devs->devs[devs->nr++] = dev; 91 + if (!bch2_dev_list_has_dev(*devs, dev)) { 92 + BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); 93 + devs->devs[devs->nr++] = dev; 94 + } 94 95 } 95 96 96 97 static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
+4 -1
fs/bcachefs/trace.h
··· 543 543 __field(u64, need_journal_commit ) 544 544 __field(u64, nouse ) 545 545 __field(bool, nonblocking ) 546 + __field(u64, nocow ) 546 547 __array(char, err, 32 ) 547 548 ), 548 549 ··· 561 560 __entry->need_journal_commit = s->skipped_need_journal_commit; 562 561 __entry->nouse = s->skipped_nouse; 563 562 __entry->nonblocking = nonblocking; 563 + __entry->nocow = s->skipped_nocow; 564 564 strscpy(__entry->err, err, sizeof(__entry->err)); 565 565 ), 566 566 567 - TP_printk("%d,%d reserve %s user %u bucket %llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u err %s", 567 + TP_printk("%d,%d reserve %s user %u bucket %llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s", 568 568 MAJOR(__entry->dev), MINOR(__entry->dev), 569 569 __entry->reserve, 570 570 __entry->user, ··· 578 576 __entry->open, 579 577 __entry->need_journal_commit, 580 578 __entry->nouse, 579 + __entry->nocow, 581 580 __entry->nonblocking, 582 581 __entry->err) 583 582 );