Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bcachefs: rebalance_work

This adds a new btree, rebalance_work, to eliminate scanning required
for finding extents that need work done on them in the background - i.e.
for the background_target and background_compression options.

rebalance_work is a bitset btree, where a KEY_TYPE_set corresponds to an
extent in the extents or reflink btree at the same pos.

A new extent field is added, bch_extent_rebalance, which indicates that
this extent has work that needs to be done in the background - and which
options to use. This allows per-inode options to be propagated to
indirect extents - at least in some circumstances. In this patch,
changing IO options on a file will not propagate the new options to
indirect extents pointed to by that file.

Updating (setting/clearing) the rebalance_work btree is done by the
extent trigger, which looks at the bch_extent_rebalance field.

Scanning is still required after changing IO path options - either just
for a given inode, or for the whole filesystem. We indicate that
scanning is required by adding a KEY_TYPE_cookie key to the
rebalance_work btree: the cookie counter is so that we can detect that
scanning is still required when an option has been flipped mid-way
through an existing scan.

Future possible work:
- Propagate options to indirect extents when being changed
- Add other IO path options - nr_replicas, ec, to rebalance_work so
they can be applied in the background when they change
- Add a counter, for bcachefs fs usage output, showing the pending
amount of rebalance work: we'll probably want to do this after the
disk space accounting rewrite (moving it to a new btree)

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

+613 -329
+1
fs/bcachefs/bcachefs.h
··· 464 464 GC_PHASE_BTREE_snapshot_trees, 465 465 GC_PHASE_BTREE_deleted_inodes, 466 466 GC_PHASE_BTREE_logged_ops, 467 + GC_PHASE_BTREE_rebalance_work, 467 468 468 469 GC_PHASE_PENDING_DELETE, 469 470 };
+12 -22
fs/bcachefs/bcachefs_format.h
··· 613 613 #endif 614 614 }; 615 615 616 - struct bch_extent_reservation { 617 - #if defined(__LITTLE_ENDIAN_BITFIELD) 618 - __u64 type:6, 619 - unused:22, 620 - replicas:4, 621 - generation:32; 622 - #elif defined (__BIG_ENDIAN_BITFIELD) 623 - __u64 generation:32, 624 - replicas:4, 625 - unused:22, 626 - type:6; 627 - #endif 628 - }; 629 - 630 616 struct bch_extent_rebalance { 631 617 #if defined(__LITTLE_ENDIAN_BITFIELD) 632 - __u64 type:7, 633 - unused:33, 634 - compression:8, 618 + __u64 type:6, 619 + unused:34, 620 + compression:8, /* enum bch_compression_opt */ 635 621 target:16; 636 622 #elif defined (__BIG_ENDIAN_BITFIELD) 637 623 __u64 target:16, 638 624 compression:8, 639 - unused:33, 640 - type:7; 625 + unused:34, 626 + type:6; 641 627 #endif 642 628 }; 643 629 ··· 1668 1682 x(snapshot_skiplists, BCH_VERSION(1, 1), \ 1669 1683 BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) \ 1670 1684 x(deleted_inodes, BCH_VERSION(1, 2), \ 1671 - BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) 1685 + BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) \ 1686 + x(rebalance_work, BCH_VERSION(1, 3), \ 1687 + BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) 1672 1688 1673 1689 enum bcachefs_metadata_version { 1674 1690 bcachefs_metadata_version_min = 9, ··· 1681 1693 }; 1682 1694 1683 1695 static const __maybe_unused 1684 - unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor; 1696 + unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work; 1685 1697 1686 1698 #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) 1687 1699 ··· 2294 2306 BIT_ULL(KEY_TYPE_set)) \ 2295 2307 x(logged_ops, 17, 0, \ 2296 2308 BIT_ULL(KEY_TYPE_logged_op_truncate)| \ 2297 - BIT_ULL(KEY_TYPE_logged_op_finsert)) 2309 + BIT_ULL(KEY_TYPE_logged_op_finsert)) \ 2310 + x(rebalance_work, 18, BTREE_ID_SNAPSHOTS, \ 2311 + BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) 2298 2312 2299 2313 enum btree_id { 2300 2314 #define x(name, 
nr, ...) BTREE_ID_##name = nr,
+10
fs/bcachefs/buckets.c
··· 1536 1536 struct bkey_s_c old, struct bkey_i *new, 1537 1537 unsigned flags) 1538 1538 { 1539 + struct bch_fs *c = trans->c; 1540 + int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) - 1541 + (int) bch2_bkey_needs_rebalance(c, old); 1542 + 1543 + if (mod) { 1544 + int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new->k.p, mod > 0); 1545 + if (ret) 1546 + return ret; 1547 + } 1548 + 1539 1549 return trigger_run_overwrite_then_insert(__trans_mark_extent, trans, btree_id, level, old, new, flags); 1540 1550 } 1541 1551
+13 -5
fs/bcachefs/compress.c
··· 697 697 return ret; 698 698 } 699 699 700 + void bch2_compression_opt_to_text(struct printbuf *out, u64 v) 701 + { 702 + struct bch_compression_opt opt = bch2_compression_decode(v); 703 + 704 + if (opt.type < BCH_COMPRESSION_OPT_NR) 705 + prt_str(out, bch2_compression_opts[opt.type]); 706 + else 707 + prt_printf(out, "(unknown compression opt %u)", opt.type); 708 + if (opt.level) 709 + prt_printf(out, ":%u", opt.level); 710 + } 711 + 700 712 void bch2_opt_compression_to_text(struct printbuf *out, 701 713 struct bch_fs *c, 702 714 struct bch_sb *sb, 703 715 u64 v) 704 716 { 705 - struct bch_compression_opt opt = bch2_compression_decode(v); 706 - 707 - prt_str(out, bch2_compression_opts[opt.type]); 708 - if (opt.level) 709 - prt_printf(out, ":%u", opt.level); 717 + return bch2_compression_opt_to_text(out, v); 710 718 } 711 719 712 720 int bch2_opt_compression_validate(u64 v, struct printbuf *err)
+2
fs/bcachefs/compress.h
··· 58 58 void bch2_fs_compress_exit(struct bch_fs *); 59 59 int bch2_fs_compress_init(struct bch_fs *); 60 60 61 + void bch2_compression_opt_to_text(struct printbuf *, u64); 62 + 61 63 int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); 62 64 void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); 63 65 int bch2_opt_compression_validate(u64, struct printbuf *);
+6 -5
fs/bcachefs/data_update.c
··· 13 13 #include "keylist.h" 14 14 #include "move.h" 15 15 #include "nocow_locking.h" 16 + #include "rebalance.h" 16 17 #include "subvolume.h" 17 18 #include "trace.h" 18 19 ··· 252 251 ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id, 253 252 k.k->p, bkey_start_pos(&insert->k)) ?: 254 253 bch2_insert_snapshot_whiteouts(trans, m->btree_id, 255 - k.k->p, insert->k.p); 256 - if (ret) 257 - goto err; 258 - 259 - ret = bch2_trans_update(trans, &iter, insert, 254 + k.k->p, insert->k.p) ?: 255 + bch2_bkey_set_needs_rebalance(c, insert, 256 + op->opts.background_target, 257 + op->opts.background_compression) ?: 258 + bch2_trans_update(trans, &iter, insert, 260 259 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 261 260 bch2_trans_commit(trans, &op->res, 262 261 NULL,
+142 -13
fs/bcachefs/extents.c
··· 13 13 #include "btree_iter.h" 14 14 #include "buckets.h" 15 15 #include "checksum.h" 16 + #include "compress.h" 16 17 #include "debug.h" 17 18 #include "disk_groups.h" 18 19 #include "error.h" ··· 758 757 return i; 759 758 } 760 759 761 - static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) 762 - { 763 - union bch_extent_entry *next = extent_entry_next(entry); 764 - 765 - /* stripes have ptrs, but their layout doesn't work with this code */ 766 - BUG_ON(k.k->type == KEY_TYPE_stripe); 767 - 768 - memmove_u64s_down(entry, next, 769 - (u64 *) bkey_val_end(k) - (u64 *) next); 770 - k.k->u64s -= (u64 *) next - (u64 *) entry; 771 - } 772 - 773 760 /* 774 761 * Returns pointer to the next entry after the one being dropped: 775 762 */ ··· 1037 1048 (u64) ec->idx, ec->block); 1038 1049 break; 1039 1050 } 1051 + case BCH_EXTENT_ENTRY_rebalance: { 1052 + const struct bch_extent_rebalance *r = &entry->rebalance; 1053 + 1054 + prt_str(out, "rebalance: target "); 1055 + if (c) 1056 + bch2_target_to_text(out, c, r->target); 1057 + else 1058 + prt_printf(out, "%u", r->target); 1059 + prt_str(out, " compression "); 1060 + bch2_compression_opt_to_text(out, r->compression); 1061 + break; 1062 + } 1040 1063 default: 1041 1064 prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); 1042 1065 return; ··· 1224 1223 } 1225 1224 have_ec = true; 1226 1225 break; 1227 - case BCH_EXTENT_ENTRY_rebalance: 1226 + case BCH_EXTENT_ENTRY_rebalance: { 1227 + const struct bch_extent_rebalance *r = &entry->rebalance; 1228 + 1229 + if (!bch2_compression_opt_valid(r->compression)) { 1230 + struct bch_compression_opt opt = __bch2_compression_decode(r->compression); 1231 + prt_printf(err, "invalid compression opt %u:%u", 1232 + opt.type, opt.level); 1233 + return -BCH_ERR_invalid_bkey; 1234 + } 1228 1235 break; 1236 + } 1229 1237 } 1230 1238 } 1231 1239 ··· 1297 1287 break; 1298 1288 } 1299 1289 } 1290 + } 1291 + 1292 + const struct bch_extent_rebalance 
*bch2_bkey_rebalance_opts(struct bkey_s_c k) 1293 + { 1294 + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 1295 + const union bch_extent_entry *entry; 1296 + 1297 + bkey_extent_entry_for_each(ptrs, entry) 1298 + if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance) 1299 + return &entry->rebalance; 1300 + 1301 + return NULL; 1302 + } 1303 + 1304 + unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, 1305 + unsigned target, unsigned compression) 1306 + { 1307 + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 1308 + unsigned rewrite_ptrs = 0; 1309 + 1310 + if (compression) { 1311 + unsigned compression_type = bch2_compression_opt_to_type(compression); 1312 + const union bch_extent_entry *entry; 1313 + struct extent_ptr_decoded p; 1314 + unsigned i = 0; 1315 + 1316 + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { 1317 + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) { 1318 + rewrite_ptrs = 0; 1319 + goto incompressible; 1320 + } 1321 + 1322 + if (!p.ptr.cached && p.crc.compression_type != compression_type) 1323 + rewrite_ptrs |= 1U << i; 1324 + i++; 1325 + } 1326 + } 1327 + incompressible: 1328 + if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { 1329 + const struct bch_extent_ptr *ptr; 1330 + unsigned i = 0; 1331 + 1332 + bkey_for_each_ptr(ptrs, ptr) { 1333 + if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target)) 1334 + rewrite_ptrs |= 1U << i; 1335 + i++; 1336 + } 1337 + } 1338 + 1339 + return rewrite_ptrs; 1340 + } 1341 + 1342 + bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k) 1343 + { 1344 + const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); 1345 + 1346 + /* 1347 + * If it's an indirect extent, we don't delete the rebalance entry when 1348 + * done so that we know what options were applied - check if it still 1349 + * needs work done: 1350 + */ 1351 + if (r && 1352 + k.k->type == KEY_TYPE_reflink_v && 1353 + !bch2_bkey_ptrs_need_rebalance(c, k, r->target, 
r->compression)) 1354 + r = NULL; 1355 + 1356 + return r != NULL; 1357 + } 1358 + 1359 + int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, 1360 + unsigned target, unsigned compression) 1361 + { 1362 + struct bkey_s k = bkey_i_to_s(_k); 1363 + struct bch_extent_rebalance *r; 1364 + bool needs_rebalance; 1365 + 1366 + if (!bkey_extent_is_direct_data(k.k)) 1367 + return 0; 1368 + 1369 + /* get existing rebalance entry: */ 1370 + r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); 1371 + if (r) { 1372 + if (k.k->type == KEY_TYPE_reflink_v) { 1373 + /* 1374 + * indirect extents: existing options take precedence, 1375 + * so that we don't move extents back and forth if 1376 + * they're referenced by different inodes with different 1377 + * options: 1378 + */ 1379 + if (r->target) 1380 + target = r->target; 1381 + if (r->compression) 1382 + compression = r->compression; 1383 + } 1384 + 1385 + r->target = target; 1386 + r->compression = compression; 1387 + } 1388 + 1389 + needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression); 1390 + 1391 + if (needs_rebalance && !r) { 1392 + union bch_extent_entry *new = bkey_val_end(k); 1393 + 1394 + new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance; 1395 + new->rebalance.compression = compression; 1396 + new->rebalance.target = target; 1397 + new->rebalance.unused = 0; 1398 + k.k->u64s += extent_entry_u64s(new); 1399 + } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) { 1400 + /* 1401 + * For indirect extents, don't delete the rebalance entry when 1402 + * we're finished so that we know we specifically moved it or 1403 + * compressed it to its current location/compression type 1404 + */ 1405 + extent_entry_drop(k, (union bch_extent_entry *) r); 1406 + } 1407 + 1408 + return 0; 1300 1409 } 1301 1410 1302 1411 /* Generic extent code: */
+20
fs/bcachefs/extents.h
··· 89 89 memcpy_u64s_small(dst, new, extent_entry_u64s(new)); 90 90 } 91 91 92 + static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) 93 + { 94 + union bch_extent_entry *next = extent_entry_next(entry); 95 + 96 + /* stripes have ptrs, but their layout doesn't work with this code */ 97 + BUG_ON(k.k->type == KEY_TYPE_stripe); 98 + 99 + memmove_u64s_down(entry, next, 100 + (u64 *) bkey_val_end(k) - (u64 *) next); 101 + k.k->u64s -= (u64 *) next - (u64 *) entry; 102 + } 103 + 92 104 static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) 93 105 { 94 106 return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; ··· 709 697 enum bkey_invalid_flags, struct printbuf *); 710 698 711 699 void bch2_ptr_swab(struct bkey_s); 700 + 701 + const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); 702 + unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, 703 + unsigned, unsigned); 704 + bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c); 705 + 706 + int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *, 707 + unsigned, unsigned); 712 708 713 709 /* Generic extent code: */ 714 710
+10 -1
fs/bcachefs/io_misc.c
··· 16 16 #include "io_misc.h" 17 17 #include "io_write.h" 18 18 #include "logged_ops.h" 19 + #include "rebalance.h" 19 20 #include "subvolume.h" 20 21 21 22 /* Overwrites whatever was present with zeroes: */ ··· 356 355 struct btree_iter iter; 357 356 struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k); 358 357 subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; 358 + struct bch_io_opts opts; 359 359 u64 dst_offset = le64_to_cpu(op->v.dst_offset); 360 360 u64 src_offset = le64_to_cpu(op->v.src_offset); 361 361 s64 shift = dst_offset - src_offset; ··· 364 362 u64 pos = le64_to_cpu(op->v.pos); 365 363 bool insert = shift > 0; 366 364 int ret = 0; 365 + 366 + ret = bch2_inum_opts_get(trans, inum, &opts); 367 + if (ret) 368 + return ret; 367 369 368 370 bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, 369 371 POS(inum.inum, 0), ··· 449 443 450 444 op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset); 451 445 452 - ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: 446 + ret = bch2_bkey_set_needs_rebalance(c, copy, 447 + opts.background_target, 448 + opts.background_compression) ?: 449 + bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: 453 450 bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: 454 451 bch2_logged_op_update(trans, &op->k_i) ?: 455 452 bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL);
+7 -13
fs/bcachefs/io_write.c
··· 351 351 bkey_start_pos(&sk.k->k), 352 352 BTREE_ITER_SLOTS|BTREE_ITER_INTENT); 353 353 354 - ret = bch2_extent_update(trans, inum, &iter, sk.k, 355 - &op->res, 356 - op->new_i_size, &op->i_sectors_delta, 357 - op->flags & BCH_WRITE_CHECK_ENOSPC); 354 + ret = bch2_bkey_set_needs_rebalance(c, sk.k, 355 + op->opts.background_target, 356 + op->opts.background_compression) ?: 357 + bch2_extent_update(trans, inum, &iter, sk.k, 358 + &op->res, 359 + op->new_i_size, &op->i_sectors_delta, 360 + op->flags & BCH_WRITE_CHECK_ENOSPC); 358 361 bch2_trans_iter_exit(trans, &iter); 359 362 360 363 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ··· 498 495 { 499 496 struct bch_fs *c = op->c; 500 497 struct keylist *keys = &op->insert_keys; 501 - struct bkey_i *k; 502 498 unsigned dev; 503 499 int ret = 0; 504 500 ··· 506 504 if (ret) 507 505 goto err; 508 506 } 509 - 510 - /* 511 - * probably not the ideal place to hook this in, but I don't 512 - * particularly want to plumb io_opts all the way through the btree 513 - * update stack right now 514 - */ 515 - for_each_keylist_key(keys, k) 516 - bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); 517 507 518 508 if (!bch2_keylist_empty(keys)) { 519 509 u64 sectors_start = keylist_sectors(keys);
+341 -240
fs/bcachefs/rebalance.c
··· 3 3 #include "bcachefs.h" 4 4 #include "alloc_foreground.h" 5 5 #include "btree_iter.h" 6 + #include "btree_update.h" 7 + #include "btree_write_buffer.h" 6 8 #include "buckets.h" 7 9 #include "clock.h" 8 10 #include "compress.h" 9 11 #include "disk_groups.h" 10 12 #include "errcode.h" 13 + #include "error.h" 14 + #include "inode.h" 11 15 #include "move.h" 12 16 #include "rebalance.h" 17 + #include "subvolume.h" 13 18 #include "super-io.h" 14 19 #include "trace.h" 15 20 ··· 22 17 #include <linux/kthread.h> 23 18 #include <linux/sched/cputime.h> 24 19 25 - /* 26 - * Check if an extent should be moved: 27 - * returns -1 if it should not be moved, or 28 - * device of pointer that should be moved, if known, or INT_MAX if unknown 29 - */ 20 + #define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) 21 + 22 + static const char * const bch2_rebalance_state_strs[] = { 23 + #define x(t) #t, 24 + BCH_REBALANCE_STATES() 25 + NULL 26 + #undef x 27 + }; 28 + 29 + static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum) 30 + { 31 + struct btree_iter iter; 32 + struct bkey_s_c k; 33 + struct bkey_i_cookie *cookie; 34 + u64 v; 35 + int ret; 36 + 37 + bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, 38 + SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), 39 + BTREE_ITER_INTENT); 40 + k = bch2_btree_iter_peek_slot(&iter); 41 + ret = bkey_err(k); 42 + if (ret) 43 + goto err; 44 + 45 + v = k.k->type == KEY_TYPE_cookie 46 + ? 
le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie) 47 + : 0; 48 + 49 + cookie = bch2_trans_kmalloc(trans, sizeof(*cookie)); 50 + ret = PTR_ERR_OR_ZERO(cookie); 51 + if (ret) 52 + goto err; 53 + 54 + bkey_cookie_init(&cookie->k_i); 55 + cookie->k.p = iter.pos; 56 + cookie->v.cookie = cpu_to_le64(v + 1); 57 + 58 + ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0); 59 + err: 60 + bch2_trans_iter_exit(trans, &iter); 61 + return ret; 62 + } 63 + 64 + int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) 65 + { 66 + int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, 67 + __bch2_set_rebalance_needs_scan(trans, inum)); 68 + rebalance_wakeup(c); 69 + return ret; 70 + } 71 + 72 + int bch2_set_fs_needs_rebalance(struct bch_fs *c) 73 + { 74 + return bch2_set_rebalance_needs_scan(c, 0); 75 + } 76 + 77 + static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie) 78 + { 79 + struct btree_iter iter; 80 + struct bkey_s_c k; 81 + u64 v; 82 + int ret; 83 + 84 + bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, 85 + SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), 86 + BTREE_ITER_INTENT); 87 + k = bch2_btree_iter_peek_slot(&iter); 88 + ret = bkey_err(k); 89 + if (ret) 90 + goto err; 91 + 92 + v = k.k->type == KEY_TYPE_cookie 93 + ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie) 94 + : 0; 95 + 96 + if (v == cookie) 97 + ret = bch2_btree_delete_at(trans, &iter, 0); 98 + err: 99 + bch2_trans_iter_exit(trans, &iter); 100 + return ret; 101 + } 102 + 103 + static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans, 104 + struct btree_iter *work_iter) 105 + { 106 + return !kthread_should_stop() 107 + ? 
bch2_btree_iter_peek(work_iter) 108 + : bkey_s_c_null; 109 + } 110 + 111 + static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, 112 + struct btree_iter *iter, 113 + struct bkey_s_c k) 114 + { 115 + struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); 116 + int ret = PTR_ERR_OR_ZERO(n); 117 + if (ret) 118 + return ret; 119 + 120 + extent_entry_drop(bkey_i_to_s(n), 121 + (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n))); 122 + return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); 123 + } 124 + 125 + static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, 126 + struct bpos work_pos, 127 + struct btree_iter *extent_iter, 128 + struct data_update_opts *data_opts) 129 + { 130 + struct bch_fs *c = trans->c; 131 + struct bkey_s_c k; 132 + 133 + bch2_trans_iter_exit(trans, extent_iter); 134 + bch2_trans_iter_init(trans, extent_iter, 135 + work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink, 136 + work_pos, 137 + BTREE_ITER_ALL_SNAPSHOTS); 138 + k = bch2_btree_iter_peek_slot(extent_iter); 139 + if (bkey_err(k)) 140 + return k; 141 + 142 + const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL; 143 + if (!r) { 144 + /* raced due to btree write buffer, nothing to do */ 145 + return bkey_s_c_null; 146 + } 147 + 148 + memset(data_opts, 0, sizeof(*data_opts)); 149 + 150 + data_opts->rewrite_ptrs = 151 + bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression); 152 + data_opts->target = r->target; 153 + 154 + if (!data_opts->rewrite_ptrs) { 155 + /* 156 + * device we would want to write to offline? devices in target 157 + * changed? 
158 + * 159 + * We'll now need a full scan before this extent is picked up 160 + * again: 161 + */ 162 + int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k); 163 + if (ret) 164 + return bkey_s_c_err(ret); 165 + return bkey_s_c_null; 166 + } 167 + 168 + return k; 169 + } 170 + 171 + noinline_for_stack 172 + static int do_rebalance_extent(struct moving_context *ctxt, 173 + struct bpos work_pos, 174 + struct btree_iter *extent_iter) 175 + { 176 + struct btree_trans *trans = ctxt->trans; 177 + struct bch_fs *c = trans->c; 178 + struct bch_fs_rebalance *r = &trans->c->rebalance; 179 + struct data_update_opts data_opts; 180 + struct bch_io_opts io_opts; 181 + struct bkey_s_c k; 182 + struct bkey_buf sk; 183 + int ret; 184 + 185 + ctxt->stats = &r->work_stats; 186 + r->state = BCH_REBALANCE_working; 187 + 188 + bch2_bkey_buf_init(&sk); 189 + 190 + ret = bkey_err(k = next_rebalance_extent(trans, work_pos, 191 + extent_iter, &data_opts)); 192 + if (ret || !k.k) 193 + goto out; 194 + 195 + ret = bch2_move_get_io_opts_one(trans, &io_opts, k); 196 + if (ret) 197 + goto out; 198 + 199 + atomic64_add(k.k->size, &ctxt->stats->sectors_seen); 200 + 201 + /* 202 + * The iterator gets unlocked by __bch2_read_extent - need to 203 + * save a copy of @k elsewhere: 204 + */ 205 + bch2_bkey_buf_reassemble(&sk, c, k); 206 + k = bkey_i_to_s_c(sk.k); 207 + 208 + ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts); 209 + if (ret) { 210 + if (bch2_err_matches(ret, ENOMEM)) { 211 + /* memory allocation failure, wait for some IO to finish */ 212 + bch2_move_ctxt_wait_for_io(ctxt); 213 + ret = -BCH_ERR_transaction_restart_nested; 214 + } 215 + 216 + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 217 + goto out; 218 + 219 + /* skip it and continue, XXX signal failure */ 220 + ret = 0; 221 + } 222 + out: 223 + bch2_bkey_buf_exit(&sk, c); 224 + return ret; 225 + } 226 + 30 227 static bool rebalance_pred(struct bch_fs *c, void *arg, 31 228 struct bkey_s_c k, 
32 229 struct bch_io_opts *io_opts, 33 230 struct data_update_opts *data_opts) 34 231 { 35 - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 36 - unsigned i; 232 + unsigned target, compression; 37 233 38 - data_opts->rewrite_ptrs = 0; 39 - data_opts->target = io_opts->background_target; 40 - data_opts->extra_replicas = 0; 41 - data_opts->btree_insert_flags = 0; 234 + if (k.k->p.inode) { 235 + target = io_opts->background_target; 236 + compression = io_opts->background_compression ?: io_opts->compression; 237 + } else { 238 + const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); 42 239 43 - if (io_opts->background_compression && 44 - !bch2_bkey_is_incompressible(k)) { 45 - const union bch_extent_entry *entry; 46 - struct extent_ptr_decoded p; 47 - 48 - i = 0; 49 - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { 50 - if (!p.ptr.cached && 51 - p.crc.compression_type != 52 - bch2_compression_opt_to_type(io_opts->background_compression)) 53 - data_opts->rewrite_ptrs |= 1U << i; 54 - i++; 55 - } 240 + target = r ? r->target : io_opts->background_target; 241 + compression = r ? 
r->compression : 242 + (io_opts->background_compression ?: io_opts->compression); 56 243 } 57 244 58 - if (io_opts->background_target) { 59 - const struct bch_extent_ptr *ptr; 60 - 61 - i = 0; 62 - bkey_for_each_ptr(ptrs, ptr) { 63 - if (!ptr->cached && 64 - !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) && 65 - bch2_target_accepts_data(c, BCH_DATA_user, io_opts->background_target)) 66 - data_opts->rewrite_ptrs |= 1U << i; 67 - i++; 68 - } 69 - } 70 - 245 + data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression); 246 + data_opts->target = target; 71 247 return data_opts->rewrite_ptrs != 0; 72 248 } 73 249 74 - void bch2_rebalance_add_key(struct bch_fs *c, 75 - struct bkey_s_c k, 76 - struct bch_io_opts *io_opts) 250 + static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) 77 251 { 78 - struct data_update_opts update_opts = { 0 }; 79 - struct bkey_ptrs_c ptrs; 80 - const struct bch_extent_ptr *ptr; 81 - unsigned i; 252 + struct btree_trans *trans = ctxt->trans; 253 + struct bch_fs_rebalance *r = &trans->c->rebalance; 254 + int ret; 82 255 83 - if (!rebalance_pred(c, NULL, k, io_opts, &update_opts)) 84 - return; 256 + bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); 257 + ctxt->stats = &r->scan_stats; 85 258 86 - i = 0; 87 - ptrs = bch2_bkey_ptrs_c(k); 88 - bkey_for_each_ptr(ptrs, ptr) { 89 - if ((1U << i) && update_opts.rewrite_ptrs) 90 - if (atomic64_add_return(k.k->size, 91 - &bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) == 92 - k.k->size) 93 - rebalance_wakeup(c); 94 - i++; 95 - } 96 - } 97 - 98 - void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) 99 - { 100 - if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) == 101 - sectors) 102 - rebalance_wakeup(c); 103 - } 104 - 105 - struct rebalance_work { 106 - int dev_most_full_idx; 107 - unsigned dev_most_full_percent; 108 - u64 dev_most_full_work; 109 - u64 dev_most_full_capacity; 110 - u64 total_work; 111 - }; 112 - 
113 - static void rebalance_work_accumulate(struct rebalance_work *w, 114 - u64 dev_work, u64 unknown_dev, u64 capacity, int idx) 115 - { 116 - unsigned percent_full; 117 - u64 work = dev_work + unknown_dev; 118 - 119 - /* avoid divide by 0 */ 120 - if (!capacity) 121 - return; 122 - 123 - if (work < dev_work || work < unknown_dev) 124 - work = U64_MAX; 125 - work = min(work, capacity); 126 - 127 - percent_full = div64_u64(work * 100, capacity); 128 - 129 - if (percent_full >= w->dev_most_full_percent) { 130 - w->dev_most_full_idx = idx; 131 - w->dev_most_full_percent = percent_full; 132 - w->dev_most_full_work = work; 133 - w->dev_most_full_capacity = capacity; 259 + if (!inum) { 260 + r->scan_start = BBPOS_MIN; 261 + r->scan_end = BBPOS_MAX; 262 + } else { 263 + r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0)); 264 + r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX)); 134 265 } 135 266 136 - if (w->total_work + dev_work >= w->total_work && 137 - w->total_work + dev_work >= dev_work) 138 - w->total_work += dev_work; 139 - } 267 + r->state = BCH_REBALANCE_scanning; 140 268 141 - static struct rebalance_work rebalance_work(struct bch_fs *c) 142 - { 143 - struct bch_dev *ca; 144 - struct rebalance_work ret = { .dev_most_full_idx = -1 }; 145 - u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev); 146 - unsigned i; 269 + ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?: 270 + commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, 271 + bch2_clear_rebalance_needs_scan(trans, inum, cookie)); 147 272 148 - for_each_online_member(ca, c, i) 149 - rebalance_work_accumulate(&ret, 150 - atomic64_read(&ca->rebalance_work), 151 - unknown_dev, 152 - bucket_to_sector(ca, ca->mi.nbuckets - 153 - ca->mi.first_bucket), 154 - i); 155 - 156 - rebalance_work_accumulate(&ret, 157 - unknown_dev, 0, c->capacity, -1); 158 - 273 + bch2_move_stats_exit(&r->scan_stats, trans->c); 159 274 return ret; 160 275 } 161 276 162 - static void 
rebalance_work_reset(struct bch_fs *c) 277 + static void rebalance_wait(struct bch_fs *c) 163 278 { 279 + struct bch_fs_rebalance *r = &c->rebalance; 164 280 struct bch_dev *ca; 281 + struct io_clock *clock = &c->io_clock[WRITE]; 282 + u64 now = atomic64_read(&clock->now); 283 + u64 min_member_capacity = 128 * 2048; 165 284 unsigned i; 166 285 167 - for_each_online_member(ca, c, i) 168 - atomic64_set(&ca->rebalance_work, 0); 286 + for_each_rw_member(ca, c, i) 287 + min_member_capacity = min(min_member_capacity, 288 + ca->mi.nbuckets * ca->mi.bucket_size); 169 289 170 - atomic64_set(&c->rebalance.work_unknown_dev, 0); 290 + r->wait_iotime_end = now + (min_member_capacity >> 6); 291 + 292 + if (r->state != BCH_REBALANCE_waiting) { 293 + r->wait_iotime_start = now; 294 + r->wait_wallclock_start = ktime_get_real_ns(); 295 + r->state = BCH_REBALANCE_waiting; 296 + } 297 + 298 + bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); 171 299 } 172 300 173 - static unsigned long curr_cputime(void) 301 + static int do_rebalance(struct moving_context *ctxt) 174 302 { 175 - u64 utime, stime; 303 + struct btree_trans *trans = ctxt->trans; 304 + struct bch_fs *c = trans->c; 305 + struct bch_fs_rebalance *r = &c->rebalance; 306 + struct btree_iter rebalance_work_iter, extent_iter = { NULL }; 307 + struct bkey_s_c k; 308 + int ret = 0; 176 309 177 - task_cputime_adjusted(current, &utime, &stime); 178 - return nsecs_to_jiffies(utime + stime); 310 + bch2_move_stats_init(&r->work_stats, "rebalance_work"); 311 + bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); 312 + 313 + bch2_trans_iter_init(trans, &rebalance_work_iter, 314 + BTREE_ID_rebalance_work, POS_MIN, 315 + BTREE_ITER_ALL_SNAPSHOTS); 316 + 317 + while (!bch2_move_ratelimit(ctxt) && 318 + !kthread_wait_freezable(r->enabled)) { 319 + bch2_trans_begin(trans); 320 + 321 + ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter)); 322 + if (bch2_err_matches(ret, 
BCH_ERR_transaction_restart)) 323 + continue; 324 + if (ret || !k.k) 325 + break; 326 + 327 + ret = k.k->type == KEY_TYPE_cookie 328 + ? do_rebalance_scan(ctxt, k.k->p.inode, 329 + le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)) 330 + : do_rebalance_extent(ctxt, k.k->p, &extent_iter); 331 + 332 + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 333 + continue; 334 + if (ret) 335 + break; 336 + 337 + bch2_btree_iter_advance(&rebalance_work_iter); 338 + } 339 + 340 + bch2_trans_iter_exit(trans, &extent_iter); 341 + bch2_trans_iter_exit(trans, &rebalance_work_iter); 342 + bch2_move_stats_exit(&r->scan_stats, c); 343 + 344 + if (!ret && 345 + !kthread_should_stop() && 346 + !atomic64_read(&r->work_stats.sectors_seen) && 347 + !atomic64_read(&r->scan_stats.sectors_seen)) { 348 + bch2_trans_unlock(trans); 349 + rebalance_wait(c); 350 + } 351 + 352 + if (!bch2_err_matches(ret, EROFS)) 353 + bch_err_fn(c, ret); 354 + return ret; 179 355 } 180 356 181 357 static int bch2_rebalance_thread(void *arg) 182 358 { 183 359 struct bch_fs *c = arg; 184 360 struct bch_fs_rebalance *r = &c->rebalance; 185 - struct io_clock *clock = &c->io_clock[WRITE]; 186 - struct rebalance_work w, p; 187 - struct bch_move_stats move_stats; 188 - unsigned long start, prev_start; 189 - unsigned long prev_run_time, prev_run_cputime; 190 - unsigned long cputime, prev_cputime; 191 - u64 io_start; 192 - long throttle; 361 + struct moving_context ctxt; 362 + int ret; 193 363 194 364 set_freezable(); 195 365 196 - io_start = atomic64_read(&clock->now); 197 - p = rebalance_work(c); 198 - prev_start = jiffies; 199 - prev_cputime = curr_cputime(); 366 + bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats, 367 + writepoint_ptr(&c->rebalance_write_point), 368 + true); 200 369 201 - bch2_move_stats_init(&move_stats, "rebalance"); 202 - while (!kthread_wait_freezable(r->enabled)) { 203 - cond_resched(); 370 + while (!kthread_should_stop() && 371 + !(ret = do_rebalance(&ctxt))) 372 + ; 204 373 205 - start = 
jiffies; 206 - cputime = curr_cputime(); 207 - 208 - prev_run_time = start - prev_start; 209 - prev_run_cputime = cputime - prev_cputime; 210 - 211 - w = rebalance_work(c); 212 - BUG_ON(!w.dev_most_full_capacity); 213 - 214 - if (!w.total_work) { 215 - r->state = REBALANCE_WAITING; 216 - kthread_wait_freezable(rebalance_work(c).total_work); 217 - continue; 218 - } 219 - 220 - /* 221 - * If there isn't much work to do, throttle cpu usage: 222 - */ 223 - throttle = prev_run_cputime * 100 / 224 - max(1U, w.dev_most_full_percent) - 225 - prev_run_time; 226 - 227 - if (w.dev_most_full_percent < 20 && throttle > 0) { 228 - r->throttled_until_iotime = io_start + 229 - div_u64(w.dev_most_full_capacity * 230 - (20 - w.dev_most_full_percent), 231 - 50); 232 - 233 - if (atomic64_read(&clock->now) + clock->max_slop < 234 - r->throttled_until_iotime) { 235 - r->throttled_until_cputime = start + throttle; 236 - r->state = REBALANCE_THROTTLED; 237 - 238 - bch2_kthread_io_clock_wait(clock, 239 - r->throttled_until_iotime, 240 - throttle); 241 - continue; 242 - } 243 - } 244 - 245 - /* minimum 1 mb/sec: */ 246 - r->pd.rate.rate = 247 - max_t(u64, 1 << 11, 248 - r->pd.rate.rate * 249 - max(p.dev_most_full_percent, 1U) / 250 - max(w.dev_most_full_percent, 1U)); 251 - 252 - io_start = atomic64_read(&clock->now); 253 - p = w; 254 - prev_start = start; 255 - prev_cputime = cputime; 256 - 257 - r->state = REBALANCE_RUNNING; 258 - memset(&move_stats, 0, sizeof(move_stats)); 259 - rebalance_work_reset(c); 260 - 261 - bch2_move_data(c, 262 - BBPOS_MIN, BBPOS_MAX, 263 - /* ratelimiting disabled for now */ 264 - NULL, /* &r->pd.rate, */ 265 - &move_stats, 266 - writepoint_ptr(&c->rebalance_write_point), 267 - true, 268 - rebalance_pred, NULL); 269 - } 374 + bch2_moving_ctxt_exit(&ctxt); 270 375 271 376 return 0; 272 377 } 273 378 274 - void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) 379 + void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) 275 
380 { 276 381 struct bch_fs_rebalance *r = &c->rebalance; 277 - struct rebalance_work w = rebalance_work(c); 278 382 279 - if (!out->nr_tabstops) 280 - printbuf_tabstop_push(out, 20); 281 - 282 - prt_printf(out, "fullest_dev (%i):", w.dev_most_full_idx); 283 - prt_tab(out); 284 - 285 - prt_human_readable_u64(out, w.dev_most_full_work << 9); 286 - prt_printf(out, "/"); 287 - prt_human_readable_u64(out, w.dev_most_full_capacity << 9); 383 + prt_str(out, bch2_rebalance_state_strs[r->state]); 288 384 prt_newline(out); 289 - 290 - prt_printf(out, "total work:"); 291 - prt_tab(out); 292 - 293 - prt_human_readable_u64(out, w.total_work << 9); 294 - prt_printf(out, "/"); 295 - prt_human_readable_u64(out, c->capacity << 9); 296 - prt_newline(out); 297 - 298 - prt_printf(out, "rate:"); 299 - prt_tab(out); 300 - prt_printf(out, "%u", r->pd.rate.rate); 301 - prt_newline(out); 385 + printbuf_indent_add(out, 2); 302 386 303 387 switch (r->state) { 304 - case REBALANCE_WAITING: 305 - prt_printf(out, "waiting"); 388 + case BCH_REBALANCE_waiting: { 389 + u64 now = atomic64_read(&c->io_clock[WRITE].now); 390 + 391 + prt_str(out, "io wait duration: "); 392 + bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start); 393 + prt_newline(out); 394 + 395 + prt_str(out, "io wait remaining: "); 396 + bch2_prt_human_readable_s64(out, r->wait_iotime_end - now); 397 + prt_newline(out); 398 + 399 + prt_str(out, "duration waited: "); 400 + bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start); 401 + prt_newline(out); 306 402 break; 307 - case REBALANCE_THROTTLED: 308 - prt_printf(out, "throttled for %lu sec or ", 309 - (r->throttled_until_cputime - jiffies) / HZ); 310 - prt_human_readable_u64(out, 311 - (r->throttled_until_iotime - 312 - atomic64_read(&c->io_clock[WRITE].now)) << 9); 313 - prt_printf(out, " io"); 403 + } 404 + case BCH_REBALANCE_working: 405 + bch2_move_stats_to_text(out, &r->work_stats); 314 406 break; 315 - case REBALANCE_RUNNING: 316 - 
prt_printf(out, "running"); 407 + case BCH_REBALANCE_scanning: 408 + bch2_move_stats_to_text(out, &r->scan_stats); 317 409 break; 318 410 } 319 411 prt_newline(out); 412 + printbuf_indent_sub(out, 2); 320 413 } 321 414 322 415 void bch2_rebalance_stop(struct bch_fs *c) ··· 463 360 void bch2_fs_rebalance_init(struct bch_fs *c) 464 361 { 465 362 bch2_pd_controller_init(&c->rebalance.pd); 466 - 467 - atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX); 468 363 }
+4 -5
fs/bcachefs/rebalance.h
··· 4 4 5 5 #include "rebalance_types.h" 6 6 7 + int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); 8 + int bch2_set_fs_needs_rebalance(struct bch_fs *); 9 + 7 10 static inline void rebalance_wakeup(struct bch_fs *c) 8 11 { 9 12 struct task_struct *p; ··· 18 15 rcu_read_unlock(); 19 16 } 20 17 21 - void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c, 22 - struct bch_io_opts *); 23 - void bch2_rebalance_add_work(struct bch_fs *, u64); 24 - 25 - void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *); 18 + void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *); 26 19 27 20 void bch2_rebalance_stop(struct bch_fs *); 28 21 int bch2_rebalance_start(struct bch_fs *);
+21 -10
fs/bcachefs/rebalance_types.h
··· 2 2 #ifndef _BCACHEFS_REBALANCE_TYPES_H 3 3 #define _BCACHEFS_REBALANCE_TYPES_H 4 4 5 + #include "bbpos_types.h" 5 6 #include "move_types.h" 6 7 7 - enum rebalance_state { 8 - REBALANCE_WAITING, 9 - REBALANCE_THROTTLED, 10 - REBALANCE_RUNNING, 8 + #define BCH_REBALANCE_STATES() \ 9 + x(waiting) \ 10 + x(working) \ 11 + x(scanning) 12 + 13 + enum bch_rebalance_states { 14 + #define x(t) BCH_REBALANCE_##t, 15 + BCH_REBALANCE_STATES() 16 + #undef x 11 17 }; 12 18 13 19 struct bch_fs_rebalance { 14 - struct task_struct __rcu *thread; 20 + struct task_struct __rcu *thread; 15 21 struct bch_pd_controller pd; 16 22 17 - atomic64_t work_unknown_dev; 23 + enum bch_rebalance_states state; 24 + u64 wait_iotime_start; 25 + u64 wait_iotime_end; 26 + u64 wait_wallclock_start; 18 27 19 - enum rebalance_state state; 20 - u64 throttled_until_iotime; 21 - unsigned long throttled_until_cputime; 28 + struct bch_move_stats work_stats; 22 29 23 - unsigned enabled:1; 30 + struct bbpos scan_start; 31 + struct bbpos scan_end; 32 + struct bch_move_stats scan_stats; 33 + 34 + unsigned enabled:1; 24 35 }; 25 36 26 37 #endif /* _BCACHEFS_REBALANCE_TYPES_H */
+1
fs/bcachefs/recovery.c
··· 23 23 #include "logged_ops.h" 24 24 #include "move.h" 25 25 #include "quota.h" 26 + #include "rebalance.h" 26 27 #include "recovery.h" 27 28 #include "replicas.h" 28 29 #include "sb-clean.h"
+1
fs/bcachefs/recovery_types.h
··· 42 42 x(check_nlinks, PASS_FSCK) \ 43 43 x(delete_dead_inodes, PASS_FSCK|PASS_UNCLEAN) \ 44 44 x(fix_reflink_p, 0) \ 45 + x(set_fs_needs_rebalance, 0) \ 45 46 46 47 enum bch_recovery_pass { 47 48 #define x(n, when) BCH_RECOVERY_PASS_##n,
+15 -6
fs/bcachefs/reflink.c
··· 7 7 #include "inode.h" 8 8 #include "io_misc.h" 9 9 #include "io_write.h" 10 + #include "rebalance.h" 10 11 #include "reflink.h" 11 12 #include "subvolume.h" 12 13 #include "super-io.h" ··· 253 252 struct bpos dst_start = POS(dst_inum.inum, dst_offset); 254 253 struct bpos src_start = POS(src_inum.inum, src_offset); 255 254 struct bpos dst_end = dst_start, src_end = src_start; 255 + struct bch_io_opts opts; 256 256 struct bpos src_want; 257 - u64 dst_done; 257 + u64 dst_done = 0; 258 258 u32 dst_snapshot, src_snapshot; 259 259 int ret = 0, ret2 = 0; 260 260 ··· 270 268 bch2_bkey_buf_init(&new_dst); 271 269 bch2_bkey_buf_init(&new_src); 272 270 trans = bch2_trans_get(c); 271 + 272 + ret = bch2_inum_opts_get(trans, src_inum, &opts); 273 + if (ret) 274 + goto err; 273 275 274 276 bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start, 275 277 BTREE_ITER_INTENT); ··· 358 352 min(src_k.k->p.offset - src_want.offset, 359 353 dst_end.offset - dst_iter.pos.offset)); 360 354 361 - ret = bch2_extent_update(trans, dst_inum, &dst_iter, 362 - new_dst.k, &disk_res, 363 - new_i_size, i_sectors_delta, 364 - true); 355 + ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, 356 + opts.background_target, 357 + opts.background_compression) ?: 358 + bch2_extent_update(trans, dst_inum, &dst_iter, 359 + new_dst.k, &disk_res, 360 + new_i_size, i_sectors_delta, 361 + true); 365 362 bch2_disk_reservation_put(c, &disk_res); 366 363 } 367 364 bch2_trans_iter_exit(trans, &dst_iter); ··· 395 386 396 387 bch2_trans_iter_exit(trans, &inode_iter); 397 388 } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart)); 398 - 389 + err: 399 390 bch2_trans_put(trans); 400 391 bch2_bkey_buf_exit(&new_src, c); 401 392 bch2_bkey_buf_exit(&new_dst, c);
+6 -8
fs/bcachefs/sysfs.c
··· 212 212 213 213 rw_attribute(rebalance_enabled); 214 214 sysfs_pd_controller_attribute(rebalance); 215 - read_attribute(rebalance_work); 215 + read_attribute(rebalance_status); 216 216 rw_attribute(promote_whole_extents); 217 217 218 218 read_attribute(new_stripes); ··· 386 386 if (attr == &sysfs_copy_gc_wait) 387 387 bch2_copygc_wait_to_text(out, c); 388 388 389 - if (attr == &sysfs_rebalance_work) 390 - bch2_rebalance_work_to_text(out, c); 389 + if (attr == &sysfs_rebalance_status) 390 + bch2_rebalance_status_to_text(out, c); 391 391 392 392 sysfs_print(promote_whole_extents, c->promote_whole_extents); 393 393 ··· 646 646 &sysfs_copy_gc_wait, 647 647 648 648 &sysfs_rebalance_enabled, 649 - &sysfs_rebalance_work, 649 + &sysfs_rebalance_status, 650 650 sysfs_pd_controller_files(rebalance), 651 651 652 652 &sysfs_moving_ctxts, ··· 707 707 bch2_opt_set_by_id(&c->opts, id, v); 708 708 709 709 if ((id == Opt_background_target || 710 - id == Opt_background_compression) && v) { 711 - bch2_rebalance_add_work(c, S64_MAX); 712 - rebalance_wakeup(c); 713 - } 710 + id == Opt_background_compression) && v) 711 + bch2_set_rebalance_needs_scan(c, 0); 714 712 715 713 ret = size; 716 714 err:
+1 -1
fs/bcachefs/xattr.c
··· 590 590 if (value && 591 591 (opt_id == Opt_background_compression || 592 592 opt_id == Opt_background_target)) 593 - bch2_rebalance_add_work(c, inode->v.i_blocks); 593 + bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum); 594 594 595 595 return bch2_err_class(ret); 596 596 }