Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bcachefs: Unwritten extents support

- bch2_extent_merge refuses to merge extents whose unwritten bits differ
- read path returns 0s for unwritten extents without actually reading
- reflink path skips over unwritten extents
- bch2_bkey_ptrs_invalid() rejects extents that have both written and
unwritten ptrs, and keys that are not normal extents (stripes, btree
ptrs) that have unwritten ptrs
- fiemap checks for unwritten extents and returns
FIEMAP_EXTENT_UNWRITTEN

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

+69 -16
+2 -2
fs/bcachefs/bcachefs_format.h
··· 582 582 __u64 type:1, 583 583 cached:1, 584 584 unused:1, 585 - reservation:1, 585 + unwritten:1, 586 586 offset:44, /* 8 petabytes */ 587 587 dev:8, 588 588 gen:8; ··· 590 590 __u64 gen:8, 591 591 dev:8, 592 592 offset:44, 593 - reservation:1, 593 + unwritten:1, 594 594 unused:1, 595 595 cached:1, 596 596 type:1;
+30 -4
fs/bcachefs/extents.c
··· 116 116 return -EIO; 117 117 118 118 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { 119 + /* 120 + * Unwritten extent: no need to actually read, treat it as a 121 + * hole and return 0s: 122 + */ 123 + if (p.ptr.unwritten) 124 + return 0; 125 + 119 126 ca = bch_dev_bkey_exists(c, p.ptr.dev); 120 127 121 128 /* ··· 276 269 rp.ptr.offset + rp.crc.offset || 277 270 lp.ptr.dev != rp.ptr.dev || 278 271 lp.ptr.gen != rp.ptr.gen || 272 + lp.ptr.unwritten != rp.ptr.unwritten || 279 273 lp.has_ec != rp.has_ec) 280 274 return false; 281 275 ··· 912 904 const union bch_extent_entry *entry1, *entry2; 913 905 struct extent_ptr_decoded p1, p2; 914 906 907 + if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2)) 908 + return false; 909 + 915 910 bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) 916 911 bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) 917 912 if (p1.ptr.dev == p2.ptr.dev && ··· 992 981 u32 offset; 993 982 u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); 994 983 995 - prt_printf(out, "ptr: %u:%llu:%u gen %u%s", ptr->dev, 996 - b, offset, ptr->gen, 997 - ptr->cached ? 
" cached" : ""); 998 - 984 + prt_printf(out, "ptr: %u:%llu:%u gen %u", 985 + ptr->dev, b, offset, ptr->gen); 986 + if (ptr->cached) 987 + prt_str(out, " cached"); 988 + if (ptr->unwritten) 989 + prt_str(out, " unwritten"); 999 990 if (ca && ptr_stale(ca, ptr)) 1000 991 prt_printf(out, " stale"); 1001 992 } ··· 1086 1073 unsigned size_ondisk = k.k->size; 1087 1074 unsigned nonce = UINT_MAX; 1088 1075 unsigned nr_ptrs = 0; 1076 + bool unwritten = false; 1089 1077 int ret; 1090 1078 1091 1079 if (bkey_is_btree_ptr(k.k)) ··· 1111 1097 false, err); 1112 1098 if (ret) 1113 1099 return ret; 1100 + 1101 + if (nr_ptrs && unwritten != entry->ptr.unwritten) { 1102 + prt_printf(err, "extent with unwritten and written ptrs"); 1103 + return -BCH_ERR_invalid_bkey; 1104 + } 1105 + 1106 + if (k.k->type != KEY_TYPE_extent && entry->ptr.unwritten) { 1107 + prt_printf(err, "has unwritten ptrs"); 1108 + return -BCH_ERR_invalid_bkey; 1109 + } 1110 + 1111 + unwritten = entry->ptr.unwritten; 1114 1112 nr_ptrs++; 1115 1113 break; 1116 1114 case BCH_EXTENT_ENTRY_crc32:
+17
fs/bcachefs/extents.h
··· 510 510 } 511 511 } 512 512 513 + static inline bool bkey_extent_is_unwritten(struct bkey_s_c k) 514 + { 515 + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 516 + const struct bch_extent_ptr *ptr; 517 + 518 + bkey_for_each_ptr(ptrs, ptr) 519 + if (ptr->unwritten) 520 + return true; 521 + return false; 522 + } 523 + 524 + static inline bool bkey_extent_is_reservation(struct bkey_s_c k) 525 + { 526 + return k.k->type == KEY_TYPE_reservation || 527 + bkey_extent_is_unwritten(k); 528 + } 529 + 513 530 static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) 514 531 { 515 532 struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+7 -7
fs/bcachefs/fs-io.c
··· 341 341 return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp); 342 342 } 343 343 344 - static unsigned bkey_to_sector_state(const struct bkey *k) 344 + static unsigned bkey_to_sector_state(struct bkey_s_c k) 345 345 { 346 - if (k->type == KEY_TYPE_reservation) 346 + if (bkey_extent_is_reservation(k)) 347 347 return SECTOR_RESERVED; 348 - if (bkey_extent_is_allocation(k)) 348 + if (bkey_extent_is_allocation(k.k)) 349 349 return SECTOR_ALLOCATED; 350 350 return SECTOR_UNALLOCATED; 351 351 } ··· 396 396 SPOS(inum.inum, offset, snapshot), 397 397 BTREE_ITER_SLOTS, k, ret) { 398 398 unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); 399 - unsigned state = bkey_to_sector_state(k.k); 399 + unsigned state = bkey_to_sector_state(k); 400 400 401 401 while (pg_idx < nr_pages) { 402 402 struct page *page = pages[pg_idx]; ··· 436 436 struct bio_vec bv; 437 437 unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v 438 438 ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); 439 - unsigned state = bkey_to_sector_state(k.k); 439 + unsigned state = bkey_to_sector_state(k); 440 440 441 441 bio_for_each_segment(bv, bio, iter) 442 442 __bch2_page_state_set(bv.bv_page, bv.bv_offset >> 9, ··· 3093 3093 goto bkey_err; 3094 3094 3095 3095 /* already reserved */ 3096 - if (k.k->type == KEY_TYPE_reservation && 3097 - bkey_s_c_to_reservation(k).v->nr_replicas >= opts.data_replicas) { 3096 + if (bkey_extent_is_reservation(k) && 3097 + bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) { 3098 3098 bch2_btree_iter_advance(&iter); 3099 3099 continue; 3100 3100 }
+3
fs/bcachefs/fs.c
··· 811 811 int flags2 = 0; 812 812 u64 offset = p.ptr.offset; 813 813 814 + if (p.ptr.unwritten) 815 + flags2 |= FIEMAP_EXTENT_UNWRITTEN; 816 + 814 817 if (p.crc.compression_type) 815 818 flags2 |= FIEMAP_EXTENT_ENCODED; 816 819 else
+2 -2
fs/bcachefs/fsck.c
··· 1251 1251 continue; 1252 1252 1253 1253 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && 1254 - k.k->type != KEY_TYPE_reservation && 1255 - k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c, 1254 + k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && 1255 + !bkey_extent_is_reservation(k), c, 1256 1256 "extent type past end of inode %llu:%u, i_size %llu\n %s", 1257 1257 i->inode.bi_inum, i->snapshot, i->inode.bi_size, 1258 1258 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+3
fs/bcachefs/io.c
··· 1481 1481 if (bch2_bkey_has_target(c, k, opts.promote_target)) 1482 1482 return false; 1483 1483 1484 + if (bkey_extent_is_unwritten(k)) 1485 + return false; 1486 + 1484 1487 if (bch2_target_congested(c, opts.promote_target)) { 1485 1488 /* XXX trace this */ 1486 1489 return false;
+5 -1
fs/bcachefs/reflink.c
··· 251 251 struct bkey_s_c k; 252 252 int ret; 253 253 254 - for_each_btree_key_upto_continue_norestart(*iter, end, 0, k, ret) 254 + for_each_btree_key_upto_continue_norestart(*iter, end, 0, k, ret) { 255 + if (bkey_extent_is_unwritten(k)) 256 + continue; 257 + 255 258 if (bkey_extent_is_data(k.k)) 256 259 return k; 260 + } 257 261 258 262 if (bkey_ge(iter->pos, end)) 259 263 bch2_btree_iter_set_pos(iter, end);